ChiefTheLord committed
Commit e6b5fc6 (verified)
1 Parent(s): 0410d5d

Upload folder using huggingface_hub
checkpoint-810/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ac048c08c0be562ede9aa236e8c85c8cffe4f8653c8243696e417ee80a44de51
+ size 6020469052
checkpoint-810/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:116fdb28ffb5f13786af2d6ddfbcf542af5575a50a791aa846652c8635be179e
+ size 460681594
checkpoint-810/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0959e23e1122ccf0546cf64dcc6b1f11129516f4d5ec4bcbf3af19d588633b63
+ size 14308
checkpoint-810/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:12902b5d135404c4085f65898232340add1dbf39fadba96cae7c04a980e3d2ef
+ size 1064
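
Note: the four binaries above are committed as Git LFS pointer files. Each pointer records only the LFS spec version, the payload's sha256 digest, and its size in bytes; the payload itself lives in LFS storage. Below is a minimal sketch of checking a downloaded payload against its pointer; the pointer and payload paths in the usage line are placeholders, not files from this commit.

import hashlib
from pathlib import Path

def verify_lfs_object(pointer_path: str, payload_path: str) -> bool:
    """Check a downloaded payload against the oid/size recorded in its LFS pointer."""
    # A pointer holds lines of the form "version <url>", "oid sha256:<hex>", "size <bytes>".
    fields = dict(
        line.split(" ", 1)
        for line in Path(pointer_path).read_text().splitlines()
        if line.strip()
    )
    expected_oid = fields["oid"].split(":", 1)[1]
    expected_size = int(fields["size"])

    payload = Path(payload_path)
    if payload.stat().st_size != expected_size:
        return False

    digest = hashlib.sha256()
    with payload.open("rb") as fh:
        # Hash in 1 MiB chunks; model.safetensors is ~6 GB, so do not read it whole.
        while chunk := fh.read(1 << 20):
            digest.update(chunk)
    return digest.hexdigest() == expected_oid

# Hypothetical usage once the real payload has been fetched:
# verify_lfs_object("model.safetensors.pointer", "checkpoint-810/model.safetensors")
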
checkpoint-810/trainer_state.json ADDED
@@ -0,0 +1,459 @@
+ {
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 4.0,
+ "eval_steps": 500,
+ "global_step": 810,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.07901234567901234,
+ "grad_norm": 1.3167223930358887,
+ "learning_rate": 3.1683168316831685e-06,
+ "loss": 8.0693,
+ "step": 16
+ },
+ {
+ "epoch": 0.1580246913580247,
+ "grad_norm": 1.3907195329666138,
+ "learning_rate": 6.336633663366337e-06,
+ "loss": 8.0604,
+ "step": 32
+ },
+ {
+ "epoch": 0.23703703703703705,
+ "grad_norm": 1.4589389562606812,
+ "learning_rate": 9.504950495049505e-06,
+ "loss": 8.014,
+ "step": 48
+ },
+ {
+ "epoch": 0.3160493827160494,
+ "grad_norm": 1.3790431022644043,
+ "learning_rate": 1.2673267326732674e-05,
+ "loss": 8.0335,
+ "step": 64
+ },
+ {
+ "epoch": 0.3950617283950617,
+ "grad_norm": 1.4199501276016235,
+ "learning_rate": 1.5841584158415843e-05,
+ "loss": 8.0402,
+ "step": 80
+ },
+ {
+ "epoch": 0.4740740740740741,
+ "grad_norm": 1.4907445907592773,
+ "learning_rate": 1.900990099009901e-05,
+ "loss": 8.0092,
+ "step": 96
+ },
+ {
+ "epoch": 0.5530864197530864,
+ "grad_norm": 1.5364277362823486,
+ "learning_rate": 1.999277438119978e-05,
+ "loss": 8.0424,
+ "step": 112
+ },
+ {
+ "epoch": 0.6320987654320988,
+ "grad_norm": 1.511999249458313,
+ "learning_rate": 1.995649347969019e-05,
+ "loss": 8.0508,
+ "step": 128
+ },
+ {
+ "epoch": 0.7111111111111111,
+ "grad_norm": 1.4422956705093384,
+ "learning_rate": 1.9889775168565942e-05,
+ "loss": 8.0349,
+ "step": 144
+ },
+ {
+ "epoch": 0.7901234567901234,
+ "grad_norm": 1.335660457611084,
+ "learning_rate": 1.9792823408445173e-05,
+ "loss": 8.0217,
+ "step": 160
+ },
+ {
+ "epoch": 0.8691358024691358,
+ "grad_norm": 1.4929100275039673,
+ "learning_rate": 1.966593458484168e-05,
+ "loss": 8.0288,
+ "step": 176
+ },
+ {
+ "epoch": 0.9481481481481482,
+ "grad_norm": 1.4001775979995728,
+ "learning_rate": 1.9509496602102253e-05,
+ "loss": 7.988,
+ "step": 192
+ },
+ {
+ "epoch": 0.9975308641975309,
+ "eval_bleu": 0.06908982865992942,
+ "eval_cap_loss": 2.4894158536312627,
+ "eval_con_loss": 2.0581045664992987,
+ "eval_loss": 6.605625005329356,
+ "step": 202
+ },
+ {
+ "epoch": 0.9975308641975309,
+ "eval_bleu": 0.06908982865992942,
+ "eval_cap_loss": 2.4894158536312627,
+ "eval_con_loss": 2.0581045664992987,
+ "eval_loss": 6.605625005329356,
+ "eval_runtime": 79.8535,
+ "eval_samples_per_second": 10.131,
+ "eval_steps_per_second": 1.277,
+ "step": 202
+ },
+ {
+ "epoch": 1.0271604938271606,
+ "grad_norm": 1.6348539590835571,
+ "learning_rate": 1.932398769756714e-05,
+ "loss": 7.9169,
+ "step": 208
+ },
+ {
+ "epoch": 1.106172839506173,
+ "grad_norm": 1.468509316444397,
+ "learning_rate": 1.9109974979578852e-05,
+ "loss": 7.9217,
+ "step": 224
+ },
+ {
+ "epoch": 1.1851851851851851,
+ "grad_norm": 1.390411376953125,
+ "learning_rate": 1.8868112693808664e-05,
+ "loss": 7.9302,
+ "step": 240
+ },
+ {
+ "epoch": 1.2641975308641975,
+ "grad_norm": 1.302958369255066,
+ "learning_rate": 1.8599140223200716e-05,
+ "loss": 7.968,
+ "step": 256
+ },
+ {
+ "epoch": 1.34320987654321,
+ "grad_norm": 1.465013861656189,
+ "learning_rate": 1.8303879827647977e-05,
+ "loss": 7.9389,
+ "step": 272
+ },
+ {
+ "epoch": 1.4222222222222223,
+ "grad_norm": 1.5777554512023926,
+ "learning_rate": 1.798323413030997e-05,
+ "loss": 7.9274,
+ "step": 288
+ },
+ {
+ "epoch": 1.5012345679012347,
+ "grad_norm": 1.3084310293197632,
+ "learning_rate": 1.76381833582567e-05,
+ "loss": 7.9272,
+ "step": 304
+ },
+ {
+ "epoch": 1.5802469135802468,
+ "grad_norm": 1.4037538766860962,
+ "learning_rate": 1.7269782345874204e-05,
+ "loss": 7.9136,
+ "step": 320
+ },
+ {
+ "epoch": 1.6592592592592592,
+ "grad_norm": 1.439072608947754,
+ "learning_rate": 1.6879157310192537e-05,
+ "loss": 7.9144,
+ "step": 336
+ },
+ {
+ "epoch": 1.7382716049382716,
+ "grad_norm": 1.4771238565444946,
+ "learning_rate": 1.6467502407993995e-05,
+ "loss": 7.9181,
+ "step": 352
+ },
+ {
+ "epoch": 1.817283950617284,
+ "grad_norm": 1.3906282186508179,
+ "learning_rate": 1.6036076085226813e-05,
+ "loss": 7.9348,
+ "step": 368
+ },
+ {
+ "epoch": 1.8962962962962964,
+ "grad_norm": 1.3744033575057983,
+ "learning_rate": 1.5586197229884185e-05,
+ "loss": 7.9507,
+ "step": 384
+ },
+ {
+ "epoch": 1.9753086419753085,
+ "grad_norm": 1.4868050813674927,
+ "learning_rate": 1.5119241140109466e-05,
+ "loss": 7.9345,
+ "step": 400
+ },
+ {
+ "epoch": 2.0,
+ "eval_bleu": 0.06595043107624254,
+ "eval_cap_loss": 2.4300825864660975,
+ "eval_con_loss": 2.058166358985153,
+ "eval_loss": 6.546415304436403,
+ "step": 405
+ },
+ {
+ "epoch": 2.0,
+ "eval_bleu": 0.06595043107624254,
+ "eval_cap_loss": 2.4300825864660975,
+ "eval_con_loss": 2.058166358985153,
+ "eval_loss": 6.546415304436403,
+ "eval_runtime": 79.4214,
+ "eval_samples_per_second": 10.186,
+ "eval_steps_per_second": 1.284,
+ "step": 405
+ },
+ {
+ "epoch": 2.054320987654321,
+ "grad_norm": 1.4153790473937988,
+ "learning_rate": 1.4636635319853274e-05,
+ "loss": 7.8805,
+ "step": 416
+ },
+ {
+ "epoch": 2.1333333333333333,
+ "grad_norm": 1.3386727571487427,
+ "learning_rate": 1.4139855114935253e-05,
+ "loss": 7.8973,
+ "step": 432
+ },
+ {
+ "epoch": 2.212345679012346,
+ "grad_norm": 1.4466866254806519,
+ "learning_rate": 1.3630419202851287e-05,
+ "loss": 7.9001,
+ "step": 448
+ },
+ {
+ "epoch": 2.291358024691358,
+ "grad_norm": 1.2998391389846802,
+ "learning_rate": 1.3109884950114007e-05,
+ "loss": 7.9096,
+ "step": 464
+ },
+ {
+ "epoch": 2.3703703703703702,
+ "grad_norm": 1.4705991744995117,
+ "learning_rate": 1.2579843651319382e-05,
+ "loss": 7.8773,
+ "step": 480
+ },
+ {
+ "epoch": 2.449382716049383,
+ "grad_norm": 1.3855006694793701,
+ "learning_rate": 1.2041915664493763e-05,
+ "loss": 7.8633,
+ "step": 496
+ },
+ {
+ "epoch": 2.528395061728395,
+ "grad_norm": 1.3997694253921509,
+ "learning_rate": 1.1497745457592817e-05,
+ "loss": 7.8597,
+ "step": 512
+ },
+ {
+ "epoch": 2.6074074074074076,
+ "grad_norm": 1.343112826347351,
+ "learning_rate": 1.0948996581295437e-05,
+ "loss": 7.8755,
+ "step": 528
+ },
+ {
+ "epoch": 2.68641975308642,
+ "grad_norm": 1.5445497035980225,
+ "learning_rate": 1.0397346583460972e-05,
+ "loss": 7.8874,
+ "step": 544
+ },
+ {
+ "epoch": 2.765432098765432,
+ "grad_norm": 1.466723918914795,
+ "learning_rate": 9.844481880796492e-06,
+ "loss": 7.9148,
+ "step": 560
+ },
+ {
+ "epoch": 2.8444444444444446,
+ "grad_norm": 1.3584568500518799,
+ "learning_rate": 9.292092603411642e-06,
+ "loss": 7.85,
+ "step": 576
+ },
+ {
+ "epoch": 2.9234567901234567,
+ "grad_norm": 1.4116945266723633,
+ "learning_rate": 8.741867428021447e-06,
+ "loss": 7.8632,
+ "step": 592
+ },
+ {
+ "epoch": 2.9975308641975307,
+ "eval_bleu": 0.0721711700240195,
+ "eval_cap_loss": 2.4210135878301133,
+ "eval_con_loss": 2.0581733035106287,
+ "eval_loss": 6.537360193682652,
+ "step": 607
+ },
+ {
+ "epoch": 2.9975308641975307,
+ "eval_bleu": 0.0721711700240195,
+ "eval_cap_loss": 2.4210135878301133,
+ "eval_con_loss": 2.0581733035106287,
+ "eval_loss": 6.537360193682652,
+ "eval_runtime": 81.5985,
+ "eval_samples_per_second": 9.914,
+ "eval_steps_per_second": 1.25,
+ "step": 607
+ },
+ {
+ "epoch": 3.0024691358024693,
+ "grad_norm": 1.864071249961853,
+ "learning_rate": 8.195488415592238e-06,
+ "loss": 7.8348,
+ "step": 608
+ },
+ {
+ "epoch": 3.0814814814814815,
+ "grad_norm": 1.5261914730072021,
+ "learning_rate": 7.654625869212147e-06,
+ "loss": 7.8239,
+ "step": 624
+ },
+ {
+ "epoch": 3.1604938271604937,
+ "grad_norm": 1.5600097179412842,
+ "learning_rate": 7.120933227905971e-06,
+ "loss": 7.8729,
+ "step": 640
+ },
+ {
+ "epoch": 3.2395061728395063,
+ "grad_norm": 1.521858811378479,
+ "learning_rate": 6.59604201200412e-06,
+ "loss": 7.8826,
+ "step": 656
+ },
+ {
+ "epoch": 3.3185185185185184,
+ "grad_norm": 1.453940749168396,
+ "learning_rate": 6.081556835517955e-06,
+ "loss": 7.8802,
+ "step": 672
+ },
+ {
+ "epoch": 3.397530864197531,
+ "grad_norm": 1.476491928100586,
+ "learning_rate": 5.579050500768837e-06,
+ "loss": 7.8213,
+ "step": 688
+ },
+ {
+ "epoch": 3.476543209876543,
+ "grad_norm": 1.4222908020019531,
+ "learning_rate": 5.090059190266779e-06,
+ "loss": 7.8651,
+ "step": 704
+ },
+ {
+ "epoch": 3.5555555555555554,
+ "grad_norm": 1.498777151107788,
+ "learning_rate": 4.616077770537453e-06,
+ "loss": 7.8873,
+ "step": 720
+ },
+ {
+ "epoch": 3.634567901234568,
+ "grad_norm": 1.4121512174606323,
+ "learning_rate": 4.158555222253772e-06,
+ "loss": 7.8258,
+ "step": 736
+ },
+ {
+ "epoch": 3.71358024691358,
+ "grad_norm": 1.5000683069229126,
+ "learning_rate": 3.718890210642442e-06,
+ "loss": 7.8599,
+ "step": 752
+ },
+ {
+ "epoch": 3.7925925925925927,
+ "grad_norm": 1.630094051361084,
+ "learning_rate": 3.2984268097069284e-06,
+ "loss": 7.8903,
+ "step": 768
+ },
+ {
+ "epoch": 3.871604938271605,
+ "grad_norm": 1.6297024488449097,
+ "learning_rate": 2.898450393337977e-06,
+ "loss": 7.8535,
+ "step": 784
+ },
+ {
+ "epoch": 3.950617283950617,
+ "grad_norm": 1.6124980449676514,
+ "learning_rate": 2.5201837058728506e-06,
+ "loss": 7.8592,
+ "step": 800
+ },
+ {
+ "epoch": 4.0,
+ "eval_bleu": 0.06875532122288064,
+ "eval_cap_loss": 2.4062162719520868,
+ "eval_con_loss": 2.0580918672038058,
+ "eval_loss": 6.522400014540729,
+ "step": 810
+ },
+ {
+ "epoch": 4.0,
+ "eval_bleu": 0.06875532122288064,
+ "eval_cap_loss": 2.4062162719520868,
+ "eval_con_loss": 2.0580918672038058,
+ "eval_loss": 6.522400014540729,
+ "eval_runtime": 82.4788,
+ "eval_samples_per_second": 9.809,
+ "eval_steps_per_second": 1.237,
+ "step": 810
+ }
+ ],
+ "logging_steps": 16,
+ "max_steps": 1010,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 5,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 0.0,
+ "train_batch_size": 16,
+ "trial_name": null,
+ "trial_params": null
+ }
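
Note: trainer_state.json above is plain JSON written by the transformers Trainer; the per-step metrics sit under log_history, with training entries carrying loss/grad_norm/learning_rate and evaluation entries carrying the eval_* keys. A small sketch, assuming the file has been downloaded to the same relative path, of pulling the loss curves out of it:

import json

with open("checkpoint-810/trainer_state.json") as fh:
    state = json.load(fh)

# Training entries log "loss"; evaluation entries log "eval_loss" instead.
train_curve = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
eval_curve = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

print(f"global_step={state['global_step']}, epoch={state['epoch']}")
print("last eval:", eval_curve[-1])  # (810, 6.522400014540729) for this checkpoint
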
checkpoint-810/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:27dd50b12e8a8125c1ae97fdb89079032fd83cd34fcb149fee2dea4a9480ffdd
+ size 5176
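
Note: training_args.bin is a TrainingArguments object saved with torch.save, and model.safetensors is a flat name-to-tensor mapping. A hedged sketch of inspecting both locally, assuming torch and safetensors are installed and the checkpoint has been downloaded; weights_only=False is needed on recent torch because the file is a full pickle, so only load it from a trusted source:

import torch
from safetensors.torch import load_file

# Unpickle the saved TrainingArguments (trusted source only).
args = torch.load("checkpoint-810/training_args.bin", weights_only=False)
print(args.num_train_epochs, args.per_device_train_batch_size, args.learning_rate)

# Load the weights on CPU as a plain dict of tensors, without instantiating the model class.
state_dict = load_file("checkpoint-810/model.safetensors", device="cpu")
print(len(state_dict), "tensors,", sum(t.numel() for t in state_dict.values()), "parameters")
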