chansung committed on
Commit
b361c75
·
verified ·
1 Parent(s): 9bc5672

Model save

Browse files
README.md ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ license: llama3.2
4
+ base_model: meta-llama/Llama-3.2-3B
5
+ tags:
6
+ - trl
7
+ - sft
8
+ - generated_from_trainer
9
+ datasets:
10
+ - generator
11
+ model-index:
12
+ - name: llama3-3b-classification-gpt4o-100k
13
+ results: []
14
+ ---
15
+
16
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
17
+ should probably proofread and complete it, then remove this comment. -->
18
+
19
+ # llama3-3b-classification-gpt4o-100k
20
+
21
+ This model is a fine-tuned version of [meta-llama/Llama-3.2-3B](https://huggingface.co/meta-llama/Llama-3.2-3B) on the generator dataset.
22
+ It achieves the following results on the evaluation set:
23
+ - Loss: 2.2580
24
+
25
+ ## Model description
26
+
27
+ More information needed
28
+
29
+ ## Intended uses & limitations
30
+
31
+ More information needed
32
+
33
+ ## Training and evaluation data
34
+
35
+ More information needed
36
+
37
+ ## Training procedure
38
+
39
+ ### Training hyperparameters
40
+
41
+ The following hyperparameters were used during training:
42
+ - learning_rate: 0.0002
43
+ - train_batch_size: 16
44
+ - eval_batch_size: 16
45
+ - seed: 42
46
+ - distributed_type: multi-GPU
47
+ - num_devices: 8
48
+ - gradient_accumulation_steps: 2
49
+ - total_train_batch_size: 256
50
+ - total_eval_batch_size: 128
51
+ - optimizer: AdamW (OptimizerNames.ADAMW_TORCH) with betas=(0.9, 0.999), epsilon=1e-08, and no additional optimizer arguments
52
+ - lr_scheduler_type: cosine
53
+ - lr_scheduler_warmup_ratio: 0.1
54
+ - num_epochs: 10
55
+
56
+ ### Training results
57
+
58
+ | Training Loss | Epoch | Step | Validation Loss |
59
+ |:-------------:|:-----:|:----:|:---------------:|
60
+ | 2.1922 | 1.0 | 30 | 2.2939 |
61
+ | 1.9883 | 2.0 | 60 | 2.2309 |
62
+ | 1.9261 | 3.0 | 90 | 2.2480 |
63
+ | 1.8770 | 4.0 | 120 | 2.2464 |
64
+ | 1.8508 | 5.0 | 150 | 2.2514 |
65
+ | 1.8357 | 6.0 | 180 | 2.2442 |
66
+ | 1.8225 | 7.0 | 210 | 2.2535 |
67
+ | 1.8153 | 8.0 | 240 | 2.2560 |
68
+ | 1.8065 | 9.0 | 270 | 2.2574 |
69
+ | 1.8084 | 10.0 | 300 | 2.2580 |
70
+
71
+
72
+ ### Framework versions
73
+
74
+ - PEFT 0.15.1
75
+ - Transformers 4.50.3
76
+ - Pytorch 2.6.0+cu124
77
+ - Datasets 3.5.0
78
+ - Tokenizers 0.21.1
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3169596dca053f259ea4bc84a28e177f77a6acb719b7d3d6336655c9b6320326
3
  size 1612749744
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e9fe231b4e6cb22f135f37cf2ba422fdca942aa24a0f807f335dd194d6c8922
3
  size 1612749744
all_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 10.0,
3
+ "total_flos": 1.3343811847501906e+18,
4
+ "train_loss": 1.9245261510213216,
5
+ "train_runtime": 1578.2466,
6
+ "train_samples": 92634,
7
+ "train_samples_per_second": 47.958,
8
+ "train_steps_per_second": 0.19
9
+ }
runs/Apr01_00-42-00_green-face-echoes-fin-01/events.out.tfevents.1743468298.green-face-echoes-fin-01.8410.0 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6ef243dd0bd14aa94ddd12ff126fe2eb1b9ec006df6b04aa0c7a9d3feb99deec
3
- size 16443
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db9596aa9ef447df0d8b1f695cd826840254e5eea46b5b5759dfd386279e5ec8
3
+ size 22101
train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 10.0,
3
+ "total_flos": 1.3343811847501906e+18,
4
+ "train_loss": 1.9245261510213216,
5
+ "train_runtime": 1578.2466,
6
+ "train_samples": 92634,
7
+ "train_samples_per_second": 47.958,
8
+ "train_steps_per_second": 0.19
9
+ }
trainer_state.json ADDED
@@ -0,0 +1,550 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 10.0,
6
+ "eval_steps": 500,
7
+ "global_step": 300,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.03333333333333333,
14
+ "grad_norm": 0.8721705675125122,
15
+ "learning_rate": 6.666666666666667e-06,
16
+ "loss": 2.5652,
17
+ "step": 1
18
+ },
19
+ {
20
+ "epoch": 0.16666666666666666,
21
+ "grad_norm": 0.7771564722061157,
22
+ "learning_rate": 3.3333333333333335e-05,
23
+ "loss": 2.5757,
24
+ "step": 5
25
+ },
26
+ {
27
+ "epoch": 0.3333333333333333,
28
+ "grad_norm": 0.4726114273071289,
29
+ "learning_rate": 6.666666666666667e-05,
30
+ "loss": 2.5393,
31
+ "step": 10
32
+ },
33
+ {
34
+ "epoch": 0.5,
35
+ "grad_norm": 0.41005024313926697,
36
+ "learning_rate": 0.0001,
37
+ "loss": 2.4619,
38
+ "step": 15
39
+ },
40
+ {
41
+ "epoch": 0.6666666666666666,
42
+ "grad_norm": 0.359015554189682,
43
+ "learning_rate": 0.00013333333333333334,
44
+ "loss": 2.3546,
45
+ "step": 20
46
+ },
47
+ {
48
+ "epoch": 0.8333333333333334,
49
+ "grad_norm": 0.32825127243995667,
50
+ "learning_rate": 0.0001666666666666667,
51
+ "loss": 2.2723,
52
+ "step": 25
53
+ },
54
+ {
55
+ "epoch": 1.0,
56
+ "grad_norm": 0.2509020268917084,
57
+ "learning_rate": 0.0002,
58
+ "loss": 2.1922,
59
+ "step": 30
60
+ },
61
+ {
62
+ "epoch": 1.0,
63
+ "eval_loss": 2.293870449066162,
64
+ "eval_runtime": 0.868,
65
+ "eval_samples_per_second": 2.304,
66
+ "eval_steps_per_second": 1.152,
67
+ "step": 30
68
+ },
69
+ {
70
+ "epoch": 1.1666666666666667,
71
+ "grad_norm": 0.17683961987495422,
72
+ "learning_rate": 0.00019983081582712685,
73
+ "loss": 2.1372,
74
+ "step": 35
75
+ },
76
+ {
77
+ "epoch": 1.3333333333333333,
78
+ "grad_norm": 0.13620507717132568,
79
+ "learning_rate": 0.00019932383577419432,
80
+ "loss": 2.092,
81
+ "step": 40
82
+ },
83
+ {
84
+ "epoch": 1.5,
85
+ "grad_norm": 0.1294490098953247,
86
+ "learning_rate": 0.00019848077530122083,
87
+ "loss": 2.0612,
88
+ "step": 45
89
+ },
90
+ {
91
+ "epoch": 1.6666666666666665,
92
+ "grad_norm": 0.12196648120880127,
93
+ "learning_rate": 0.00019730448705798239,
94
+ "loss": 2.029,
95
+ "step": 50
96
+ },
97
+ {
98
+ "epoch": 1.8333333333333335,
99
+ "grad_norm": 0.12746499478816986,
100
+ "learning_rate": 0.0001957989512315489,
101
+ "loss": 2.0055,
102
+ "step": 55
103
+ },
104
+ {
105
+ "epoch": 2.0,
106
+ "grad_norm": 0.11664091795682907,
107
+ "learning_rate": 0.00019396926207859084,
108
+ "loss": 1.9883,
109
+ "step": 60
110
+ },
111
+ {
112
+ "epoch": 2.0,
113
+ "eval_loss": 2.2309093475341797,
114
+ "eval_runtime": 0.8673,
115
+ "eval_samples_per_second": 2.306,
116
+ "eval_steps_per_second": 1.153,
117
+ "step": 60
118
+ },
119
+ {
120
+ "epoch": 2.1666666666666665,
121
+ "grad_norm": 0.12395822256803513,
122
+ "learning_rate": 0.00019182161068802741,
123
+ "loss": 1.9631,
124
+ "step": 65
125
+ },
126
+ {
127
+ "epoch": 2.3333333333333335,
128
+ "grad_norm": 0.14801518619060516,
129
+ "learning_rate": 0.00018936326403234125,
130
+ "loss": 1.9482,
131
+ "step": 70
132
+ },
133
+ {
134
+ "epoch": 2.5,
135
+ "grad_norm": 0.12537938356399536,
136
+ "learning_rate": 0.00018660254037844388,
137
+ "loss": 1.9543,
138
+ "step": 75
139
+ },
140
+ {
141
+ "epoch": 2.6666666666666665,
142
+ "grad_norm": 0.13340197503566742,
143
+ "learning_rate": 0.00018354878114129367,
144
+ "loss": 1.9364,
145
+ "step": 80
146
+ },
147
+ {
148
+ "epoch": 2.8333333333333335,
149
+ "grad_norm": 0.16110606491565704,
150
+ "learning_rate": 0.0001802123192755044,
151
+ "loss": 1.9286,
152
+ "step": 85
153
+ },
154
+ {
155
+ "epoch": 3.0,
156
+ "grad_norm": 0.14026618003845215,
157
+ "learning_rate": 0.0001766044443118978,
158
+ "loss": 1.9261,
159
+ "step": 90
160
+ },
161
+ {
162
+ "epoch": 3.0,
163
+ "eval_loss": 2.248041868209839,
164
+ "eval_runtime": 0.8685,
165
+ "eval_samples_per_second": 2.303,
166
+ "eval_steps_per_second": 1.151,
167
+ "step": 90
168
+ },
169
+ {
170
+ "epoch": 3.1666666666666665,
171
+ "grad_norm": 0.13984443247318268,
172
+ "learning_rate": 0.00017273736415730488,
173
+ "loss": 1.9053,
174
+ "step": 95
175
+ },
176
+ {
177
+ "epoch": 3.3333333333333335,
178
+ "grad_norm": 0.17301388084888458,
179
+ "learning_rate": 0.0001686241637868734,
180
+ "loss": 1.9071,
181
+ "step": 100
182
+ },
183
+ {
184
+ "epoch": 3.5,
185
+ "grad_norm": 0.15998174250125885,
186
+ "learning_rate": 0.00016427876096865394,
187
+ "loss": 1.896,
188
+ "step": 105
189
+ },
190
+ {
191
+ "epoch": 3.6666666666666665,
192
+ "grad_norm": 0.17066697776317596,
193
+ "learning_rate": 0.00015971585917027862,
194
+ "loss": 1.891,
195
+ "step": 110
196
+ },
197
+ {
198
+ "epoch": 3.8333333333333335,
199
+ "grad_norm": 0.18711982667446136,
200
+ "learning_rate": 0.0001549508978070806,
201
+ "loss": 1.8784,
202
+ "step": 115
203
+ },
204
+ {
205
+ "epoch": 4.0,
206
+ "grad_norm": 0.18189194798469543,
207
+ "learning_rate": 0.00015000000000000001,
208
+ "loss": 1.877,
209
+ "step": 120
210
+ },
211
+ {
212
+ "epoch": 4.0,
213
+ "eval_loss": 2.246403217315674,
214
+ "eval_runtime": 0.8876,
215
+ "eval_samples_per_second": 2.253,
216
+ "eval_steps_per_second": 1.127,
217
+ "step": 120
218
+ },
219
+ {
220
+ "epoch": 4.166666666666667,
221
+ "grad_norm": 0.19101205468177795,
222
+ "learning_rate": 0.00014487991802004623,
223
+ "loss": 1.8693,
224
+ "step": 125
225
+ },
226
+ {
227
+ "epoch": 4.333333333333333,
228
+ "grad_norm": 0.18224620819091797,
229
+ "learning_rate": 0.0001396079766039157,
230
+ "loss": 1.8717,
231
+ "step": 130
232
+ },
233
+ {
234
+ "epoch": 4.5,
235
+ "grad_norm": 0.21038778126239777,
236
+ "learning_rate": 0.00013420201433256689,
237
+ "loss": 1.865,
238
+ "step": 135
239
+ },
240
+ {
241
+ "epoch": 4.666666666666667,
242
+ "grad_norm": 0.20972222089767456,
243
+ "learning_rate": 0.00012868032327110904,
244
+ "loss": 1.8534,
245
+ "step": 140
246
+ },
247
+ {
248
+ "epoch": 4.833333333333333,
249
+ "grad_norm": 0.20562510192394257,
250
+ "learning_rate": 0.00012306158707424403,
251
+ "loss": 1.8507,
252
+ "step": 145
253
+ },
254
+ {
255
+ "epoch": 5.0,
256
+ "grad_norm": 0.18542522192001343,
257
+ "learning_rate": 0.00011736481776669306,
258
+ "loss": 1.8508,
259
+ "step": 150
260
+ },
261
+ {
262
+ "epoch": 5.0,
263
+ "eval_loss": 2.251426935195923,
264
+ "eval_runtime": 0.8697,
265
+ "eval_samples_per_second": 2.3,
266
+ "eval_steps_per_second": 1.15,
267
+ "step": 150
268
+ },
269
+ {
270
+ "epoch": 5.166666666666667,
271
+ "grad_norm": 0.19300299882888794,
272
+ "learning_rate": 0.00011160929141252303,
273
+ "loss": 1.8441,
274
+ "step": 155
275
+ },
276
+ {
277
+ "epoch": 5.333333333333333,
278
+ "grad_norm": 0.17082081735134125,
279
+ "learning_rate": 0.00010581448289104758,
280
+ "loss": 1.8406,
281
+ "step": 160
282
+ },
283
+ {
284
+ "epoch": 5.5,
285
+ "grad_norm": 0.16670560836791992,
286
+ "learning_rate": 0.0001,
287
+ "loss": 1.8314,
288
+ "step": 165
289
+ },
290
+ {
291
+ "epoch": 5.666666666666667,
292
+ "grad_norm": 0.2005225419998169,
293
+ "learning_rate": 9.418551710895243e-05,
294
+ "loss": 1.84,
295
+ "step": 170
296
+ },
297
+ {
298
+ "epoch": 5.833333333333333,
299
+ "grad_norm": 0.17486542463302612,
300
+ "learning_rate": 8.839070858747697e-05,
301
+ "loss": 1.8356,
302
+ "step": 175
303
+ },
304
+ {
305
+ "epoch": 6.0,
306
+ "grad_norm": 0.17715969681739807,
307
+ "learning_rate": 8.263518223330697e-05,
308
+ "loss": 1.8357,
309
+ "step": 180
310
+ },
311
+ {
312
+ "epoch": 6.0,
313
+ "eval_loss": 2.244203805923462,
314
+ "eval_runtime": 0.8698,
315
+ "eval_samples_per_second": 2.299,
316
+ "eval_steps_per_second": 1.15,
317
+ "step": 180
318
+ },
319
+ {
320
+ "epoch": 6.166666666666667,
321
+ "grad_norm": 0.1805175244808197,
322
+ "learning_rate": 7.693841292575598e-05,
323
+ "loss": 1.8293,
324
+ "step": 185
325
+ },
326
+ {
327
+ "epoch": 6.333333333333333,
328
+ "grad_norm": 0.16339732706546783,
329
+ "learning_rate": 7.131967672889101e-05,
330
+ "loss": 1.8289,
331
+ "step": 190
332
+ },
333
+ {
334
+ "epoch": 6.5,
335
+ "grad_norm": 0.17380179464817047,
336
+ "learning_rate": 6.579798566743314e-05,
337
+ "loss": 1.8242,
338
+ "step": 195
339
+ },
340
+ {
341
+ "epoch": 6.666666666666667,
342
+ "grad_norm": 0.18039536476135254,
343
+ "learning_rate": 6.039202339608432e-05,
344
+ "loss": 1.8221,
345
+ "step": 200
346
+ },
347
+ {
348
+ "epoch": 6.833333333333333,
349
+ "grad_norm": 0.1678876280784607,
350
+ "learning_rate": 5.5120081979953785e-05,
351
+ "loss": 1.8182,
352
+ "step": 205
353
+ },
354
+ {
355
+ "epoch": 7.0,
356
+ "grad_norm": 0.18717928230762482,
357
+ "learning_rate": 5.000000000000002e-05,
358
+ "loss": 1.8225,
359
+ "step": 210
360
+ },
361
+ {
362
+ "epoch": 7.0,
363
+ "eval_loss": 2.2534749507904053,
364
+ "eval_runtime": 0.8691,
365
+ "eval_samples_per_second": 2.301,
366
+ "eval_steps_per_second": 1.151,
367
+ "step": 210
368
+ },
369
+ {
370
+ "epoch": 7.166666666666667,
371
+ "grad_norm": 0.1713215708732605,
372
+ "learning_rate": 4.50491021929194e-05,
373
+ "loss": 1.8162,
374
+ "step": 215
375
+ },
376
+ {
377
+ "epoch": 7.333333333333333,
378
+ "grad_norm": 0.14658918976783752,
379
+ "learning_rate": 4.028414082972141e-05,
380
+ "loss": 1.8146,
381
+ "step": 220
382
+ },
383
+ {
384
+ "epoch": 7.5,
385
+ "grad_norm": 0.16408003866672516,
386
+ "learning_rate": 3.5721239031346066e-05,
387
+ "loss": 1.8179,
388
+ "step": 225
389
+ },
390
+ {
391
+ "epoch": 7.666666666666667,
392
+ "grad_norm": 0.15933865308761597,
393
+ "learning_rate": 3.137583621312665e-05,
394
+ "loss": 1.8101,
395
+ "step": 230
396
+ },
397
+ {
398
+ "epoch": 7.833333333333333,
399
+ "grad_norm": 0.14181320369243622,
400
+ "learning_rate": 2.7262635842695127e-05,
401
+ "loss": 1.8199,
402
+ "step": 235
403
+ },
404
+ {
405
+ "epoch": 8.0,
406
+ "grad_norm": 0.16157026588916779,
407
+ "learning_rate": 2.339555568810221e-05,
408
+ "loss": 1.8153,
409
+ "step": 240
410
+ },
411
+ {
412
+ "epoch": 8.0,
413
+ "eval_loss": 2.2560064792633057,
414
+ "eval_runtime": 0.8693,
415
+ "eval_samples_per_second": 2.301,
416
+ "eval_steps_per_second": 1.15,
417
+ "step": 240
418
+ },
419
+ {
420
+ "epoch": 8.166666666666666,
421
+ "grad_norm": 0.14123442769050598,
422
+ "learning_rate": 1.9787680724495617e-05,
423
+ "loss": 1.8138,
424
+ "step": 245
425
+ },
426
+ {
427
+ "epoch": 8.333333333333334,
428
+ "grad_norm": 0.14202268421649933,
429
+ "learning_rate": 1.6451218858706374e-05,
430
+ "loss": 1.8143,
431
+ "step": 250
432
+ },
433
+ {
434
+ "epoch": 8.5,
435
+ "grad_norm": 0.15732887387275696,
436
+ "learning_rate": 1.339745962155613e-05,
437
+ "loss": 1.811,
438
+ "step": 255
439
+ },
440
+ {
441
+ "epoch": 8.666666666666666,
442
+ "grad_norm": 0.13898594677448273,
443
+ "learning_rate": 1.0636735967658784e-05,
444
+ "loss": 1.8033,
445
+ "step": 260
446
+ },
447
+ {
448
+ "epoch": 8.833333333333334,
449
+ "grad_norm": 0.1528938263654709,
450
+ "learning_rate": 8.178389311972612e-06,
451
+ "loss": 1.8194,
452
+ "step": 265
453
+ },
454
+ {
455
+ "epoch": 9.0,
456
+ "grad_norm": 0.14544987678527832,
457
+ "learning_rate": 6.030737921409169e-06,
458
+ "loss": 1.8065,
459
+ "step": 270
460
+ },
461
+ {
462
+ "epoch": 9.0,
463
+ "eval_loss": 2.257399320602417,
464
+ "eval_runtime": 0.8685,
465
+ "eval_samples_per_second": 2.303,
466
+ "eval_steps_per_second": 1.151,
467
+ "step": 270
468
+ },
469
+ {
470
+ "epoch": 9.166666666666666,
471
+ "grad_norm": 0.13697919249534607,
472
+ "learning_rate": 4.20104876845111e-06,
473
+ "loss": 1.8037,
474
+ "step": 275
475
+ },
476
+ {
477
+ "epoch": 9.333333333333334,
478
+ "grad_norm": 0.134043887257576,
479
+ "learning_rate": 2.6955129420176196e-06,
480
+ "loss": 1.8095,
481
+ "step": 280
482
+ },
483
+ {
484
+ "epoch": 9.5,
485
+ "grad_norm": 0.1406584531068802,
486
+ "learning_rate": 1.5192246987791981e-06,
487
+ "loss": 1.8133,
488
+ "step": 285
489
+ },
490
+ {
491
+ "epoch": 9.666666666666666,
492
+ "grad_norm": 0.1350001096725464,
493
+ "learning_rate": 6.761642258056978e-07,
494
+ "loss": 1.8064,
495
+ "step": 290
496
+ },
497
+ {
498
+ "epoch": 9.833333333333334,
499
+ "grad_norm": 0.13039974868297577,
500
+ "learning_rate": 1.6918417287318245e-07,
501
+ "loss": 1.8157,
502
+ "step": 295
503
+ },
504
+ {
505
+ "epoch": 10.0,
506
+ "grad_norm": 0.12890292704105377,
507
+ "learning_rate": 0.0,
508
+ "loss": 1.8084,
509
+ "step": 300
510
+ },
511
+ {
512
+ "epoch": 10.0,
513
+ "eval_loss": 2.2579641342163086,
514
+ "eval_runtime": 0.8884,
515
+ "eval_samples_per_second": 2.251,
516
+ "eval_steps_per_second": 1.126,
517
+ "step": 300
518
+ },
519
+ {
520
+ "epoch": 10.0,
521
+ "step": 300,
522
+ "total_flos": 1.3343811847501906e+18,
523
+ "train_loss": 1.9245261510213216,
524
+ "train_runtime": 1578.2466,
525
+ "train_samples_per_second": 47.958,
526
+ "train_steps_per_second": 0.19
527
+ }
528
+ ],
529
+ "logging_steps": 5,
530
+ "max_steps": 300,
531
+ "num_input_tokens_seen": 0,
532
+ "num_train_epochs": 10,
533
+ "save_steps": 100,
534
+ "stateful_callbacks": {
535
+ "TrainerControl": {
536
+ "args": {
537
+ "should_epoch_stop": false,
538
+ "should_evaluate": false,
539
+ "should_log": false,
540
+ "should_save": true,
541
+ "should_training_stop": true
542
+ },
543
+ "attributes": {}
544
+ }
545
+ },
546
+ "total_flos": 1.3343811847501906e+18,
547
+ "train_batch_size": 16,
548
+ "trial_name": null,
549
+ "trial_params": null
550
+ }