Raihan004 commited on
Commit
6f44a33
·
1 Parent(s): bba2204

🍻 cheers

Browse files
README.md CHANGED
@@ -2,6 +2,7 @@
2
  license: apache-2.0
3
  base_model: google/vit-base-patch16-224-in21k
4
  tags:
 
5
  - generated_from_trainer
6
  datasets:
7
  - image_folder
@@ -14,7 +15,7 @@ model-index:
14
  name: Image Classification
15
  type: image-classification
16
  dataset:
17
- name: image_folder
18
  type: image_folder
19
  config: hierarchical-action-agent
20
  split: train
@@ -22,7 +23,7 @@ model-index:
22
  metrics:
23
  - name: Accuracy
24
  type: accuracy
25
- value: 0.8330935251798561
26
  ---
27
 
28
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -30,10 +31,10 @@ should probably proofread and complete it, then remove this comment. -->
30
 
31
  # Hierarchical_Agent_Action
32
 
33
- This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on the image_folder dataset.
34
  It achieves the following results on the evaluation set:
35
- - Loss: 0.6262
36
- - Accuracy: 0.8331
37
 
38
  ## Model description
39
 
 
2
  license: apache-2.0
3
  base_model: google/vit-base-patch16-224-in21k
4
  tags:
5
+ - image-classification
6
  - generated_from_trainer
7
  datasets:
8
  - image_folder
 
15
  name: Image Classification
16
  type: image-classification
17
  dataset:
18
+ name: agent_action_class
19
  type: image_folder
20
  config: hierarchical-action-agent
21
  split: train
 
23
  metrics:
24
  - name: Accuracy
25
  type: accuracy
26
+ value: 0.8402877697841726
27
  ---
28
 
29
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 
31
 
32
  # Hierarchical_Agent_Action
33
 
34
+ This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on the agent_action_class dataset.
35
  It achieves the following results on the evaluation set:
36
+ - Loss: 0.5942
37
+ - Accuracy: 0.8403
38
 
39
  ## Model description
40
 
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
- "epoch": 20.0,
3
- "eval_accuracy": 0.8215827338129497,
4
- "eval_loss": 0.6021825075149536,
5
- "eval_runtime": 12.5627,
6
- "eval_samples_per_second": 55.322,
7
- "eval_steps_per_second": 6.925,
8
- "total_flos": 6.103469271205233e+18,
9
- "train_loss": 0.6948060337574251,
10
- "train_runtime": 2796.6359,
11
- "train_samples_per_second": 28.155,
12
- "train_steps_per_second": 0.887
13
  }
 
1
  {
2
+ "epoch": 30.0,
3
+ "eval_accuracy": 0.8402877697841726,
4
+ "eval_loss": 0.5942012071609497,
5
+ "eval_runtime": 12.2205,
6
+ "eval_samples_per_second": 56.872,
7
+ "eval_steps_per_second": 7.119,
8
+ "total_flos": 9.155203906807849e+18,
9
+ "train_loss": 0.5316014153983003,
10
+ "train_runtime": 4018.8325,
11
+ "train_samples_per_second": 29.389,
12
+ "train_steps_per_second": 0.926
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 20.0,
3
- "eval_accuracy": 0.8215827338129497,
4
- "eval_loss": 0.6021825075149536,
5
- "eval_runtime": 12.5627,
6
- "eval_samples_per_second": 55.322,
7
- "eval_steps_per_second": 6.925
8
  }
 
1
  {
2
+ "epoch": 30.0,
3
+ "eval_accuracy": 0.8402877697841726,
4
+ "eval_loss": 0.5942012071609497,
5
+ "eval_runtime": 12.2205,
6
+ "eval_samples_per_second": 56.872,
7
+ "eval_steps_per_second": 7.119
8
  }
runs/Dec27_07-48-13_fc6b8939f5d8/events.out.tfevents.1703667338.fc6b8939f5d8.47.3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5cccf719961c28b8e75b354a3fac564d928f8a8e2592a378edba4b45e26b9969
3
+ size 411
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 20.0,
3
- "total_flos": 6.103469271205233e+18,
4
- "train_loss": 0.6948060337574251,
5
- "train_runtime": 2796.6359,
6
- "train_samples_per_second": 28.155,
7
- "train_steps_per_second": 0.887
8
  }
 
1
  {
2
+ "epoch": 30.0,
3
+ "total_flos": 9.155203906807849e+18,
4
+ "train_loss": 0.5316014153983003,
5
+ "train_runtime": 4018.8325,
6
+ "train_samples_per_second": 29.389,
7
+ "train_steps_per_second": 0.926
8
  }
trainer_state.json CHANGED
@@ -1,988 +1,1477 @@
1
  {
2
- "best_metric": 0.6021825075149536,
3
- "best_model_checkpoint": "Hierarchical_Agent_Action/checkpoint-2200",
4
- "epoch": 20.0,
5
  "eval_steps": 100,
6
- "global_step": 2480,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.16,
13
- "learning_rate": 9.919354838709678e-05,
14
- "loss": 3.4603,
15
  "step": 20
16
  },
17
  {
18
  "epoch": 0.32,
19
- "learning_rate": 9.838709677419355e-05,
20
- "loss": 3.2338,
21
  "step": 40
22
  },
23
  {
24
  "epoch": 0.48,
25
- "learning_rate": 9.758064516129033e-05,
26
- "loss": 2.9478,
27
  "step": 60
28
  },
29
  {
30
  "epoch": 0.65,
31
- "learning_rate": 9.677419354838711e-05,
32
- "loss": 2.7149,
33
  "step": 80
34
  },
35
  {
36
  "epoch": 0.81,
37
- "learning_rate": 9.596774193548387e-05,
38
- "loss": 2.4733,
39
  "step": 100
40
  },
41
  {
42
  "epoch": 0.81,
43
- "eval_accuracy": 0.6172661870503597,
44
- "eval_loss": 2.2958028316497803,
45
- "eval_runtime": 12.3246,
46
- "eval_samples_per_second": 56.391,
47
- "eval_steps_per_second": 7.059,
48
  "step": 100
49
  },
50
  {
51
  "epoch": 0.97,
52
- "learning_rate": 9.516129032258065e-05,
53
- "loss": 2.308,
54
  "step": 120
55
  },
56
  {
57
  "epoch": 1.13,
58
- "learning_rate": 9.439516129032259e-05,
59
- "loss": 2.0956,
60
  "step": 140
61
  },
62
  {
63
  "epoch": 1.29,
64
- "learning_rate": 9.358870967741936e-05,
65
- "loss": 2.0524,
66
  "step": 160
67
  },
68
  {
69
  "epoch": 1.45,
70
- "learning_rate": 9.278225806451613e-05,
71
- "loss": 1.8984,
72
  "step": 180
73
  },
74
  {
75
  "epoch": 1.61,
76
- "learning_rate": 9.197580645161291e-05,
77
- "loss": 1.7952,
78
  "step": 200
79
  },
80
  {
81
  "epoch": 1.61,
82
- "eval_accuracy": 0.7251798561151079,
83
- "eval_loss": 1.6266770362854004,
84
- "eval_runtime": 12.444,
85
- "eval_samples_per_second": 55.85,
86
- "eval_steps_per_second": 6.991,
87
  "step": 200
88
  },
89
  {
90
  "epoch": 1.77,
91
- "learning_rate": 9.116935483870969e-05,
92
- "loss": 1.7496,
93
  "step": 220
94
  },
95
  {
96
  "epoch": 1.94,
97
- "learning_rate": 9.036290322580646e-05,
98
- "loss": 1.5949,
99
  "step": 240
100
  },
101
  {
102
  "epoch": 2.1,
103
- "learning_rate": 8.955645161290322e-05,
104
- "loss": 1.4567,
105
  "step": 260
106
  },
107
  {
108
  "epoch": 2.26,
109
- "learning_rate": 8.875e-05,
110
- "loss": 1.4255,
111
  "step": 280
112
  },
113
  {
114
  "epoch": 2.42,
115
- "learning_rate": 8.794354838709677e-05,
116
- "loss": 1.409,
117
  "step": 300
118
  },
119
  {
120
  "epoch": 2.42,
121
- "eval_accuracy": 0.7525179856115108,
122
- "eval_loss": 1.2853118181228638,
123
- "eval_runtime": 12.6438,
124
- "eval_samples_per_second": 54.968,
125
- "eval_steps_per_second": 6.881,
126
  "step": 300
127
  },
128
  {
129
  "epoch": 2.58,
130
- "learning_rate": 8.713709677419355e-05,
131
- "loss": 1.3729,
132
  "step": 320
133
  },
134
  {
135
  "epoch": 2.74,
136
- "learning_rate": 8.633064516129033e-05,
137
- "loss": 1.3318,
138
  "step": 340
139
  },
140
  {
141
  "epoch": 2.9,
142
- "learning_rate": 8.55241935483871e-05,
143
- "loss": 1.246,
144
  "step": 360
145
  },
146
  {
147
  "epoch": 3.06,
148
- "learning_rate": 8.471774193548387e-05,
149
- "loss": 1.1057,
150
  "step": 380
151
  },
152
  {
153
  "epoch": 3.23,
154
- "learning_rate": 8.391129032258065e-05,
155
- "loss": 1.1321,
156
  "step": 400
157
  },
158
  {
159
  "epoch": 3.23,
160
- "eval_accuracy": 0.7971223021582734,
161
- "eval_loss": 0.9899287819862366,
162
- "eval_runtime": 12.5295,
163
- "eval_samples_per_second": 55.469,
164
- "eval_steps_per_second": 6.944,
165
  "step": 400
166
  },
167
  {
168
  "epoch": 3.39,
169
- "learning_rate": 8.310483870967743e-05,
170
- "loss": 1.0894,
171
  "step": 420
172
  },
173
  {
174
  "epoch": 3.55,
175
- "learning_rate": 8.22983870967742e-05,
176
- "loss": 1.0102,
177
  "step": 440
178
  },
179
  {
180
  "epoch": 3.71,
181
- "learning_rate": 8.149193548387097e-05,
182
- "loss": 1.0241,
183
  "step": 460
184
  },
185
  {
186
  "epoch": 3.87,
187
- "learning_rate": 8.068548387096775e-05,
188
- "loss": 0.9724,
189
  "step": 480
190
  },
191
  {
192
  "epoch": 4.03,
193
- "learning_rate": 7.987903225806452e-05,
194
- "loss": 0.9094,
195
  "step": 500
196
  },
197
  {
198
  "epoch": 4.03,
199
- "eval_accuracy": 0.8028776978417266,
200
- "eval_loss": 0.8827974200248718,
201
- "eval_runtime": 12.471,
202
- "eval_samples_per_second": 55.729,
203
- "eval_steps_per_second": 6.976,
204
  "step": 500
205
  },
206
  {
207
  "epoch": 4.19,
208
- "learning_rate": 7.90725806451613e-05,
209
- "loss": 0.8774,
210
  "step": 520
211
  },
212
  {
213
  "epoch": 4.35,
214
- "learning_rate": 7.826612903225808e-05,
215
- "loss": 0.8719,
216
  "step": 540
217
  },
218
  {
219
  "epoch": 4.52,
220
- "learning_rate": 7.745967741935484e-05,
221
- "loss": 0.9112,
222
  "step": 560
223
  },
224
  {
225
  "epoch": 4.68,
226
- "learning_rate": 7.665322580645161e-05,
227
- "loss": 0.8573,
228
  "step": 580
229
  },
230
  {
231
  "epoch": 4.84,
232
- "learning_rate": 7.584677419354839e-05,
233
- "loss": 0.8552,
234
  "step": 600
235
  },
236
  {
237
  "epoch": 4.84,
238
- "eval_accuracy": 0.7884892086330936,
239
- "eval_loss": 0.8248726725578308,
240
- "eval_runtime": 12.3792,
241
- "eval_samples_per_second": 56.143,
242
- "eval_steps_per_second": 7.028,
243
  "step": 600
244
  },
245
  {
246
  "epoch": 5.0,
247
- "learning_rate": 7.504032258064517e-05,
248
- "loss": 0.8021,
249
  "step": 620
250
  },
251
  {
252
  "epoch": 5.16,
253
- "learning_rate": 7.423387096774193e-05,
254
- "loss": 0.7215,
255
  "step": 640
256
  },
257
  {
258
  "epoch": 5.32,
259
- "learning_rate": 7.342741935483871e-05,
260
- "loss": 0.6933,
261
  "step": 660
262
  },
263
  {
264
  "epoch": 5.48,
265
- "learning_rate": 7.262096774193549e-05,
266
- "loss": 0.7272,
267
  "step": 680
268
  },
269
  {
270
  "epoch": 5.65,
271
- "learning_rate": 7.181451612903225e-05,
272
- "loss": 0.7082,
273
  "step": 700
274
  },
275
  {
276
  "epoch": 5.65,
277
- "eval_accuracy": 0.7755395683453238,
278
- "eval_loss": 0.8111026287078857,
279
- "eval_runtime": 12.4288,
280
- "eval_samples_per_second": 55.919,
281
- "eval_steps_per_second": 7.0,
282
  "step": 700
283
  },
284
  {
285
  "epoch": 5.81,
286
- "learning_rate": 7.100806451612903e-05,
287
- "loss": 0.6704,
288
  "step": 720
289
  },
290
  {
291
  "epoch": 5.97,
292
- "learning_rate": 7.020161290322581e-05,
293
- "loss": 0.6963,
294
  "step": 740
295
  },
296
  {
297
  "epoch": 6.13,
298
- "learning_rate": 6.939516129032259e-05,
299
- "loss": 0.6072,
300
  "step": 760
301
  },
302
  {
303
  "epoch": 6.29,
304
- "learning_rate": 6.858870967741936e-05,
305
- "loss": 0.65,
306
  "step": 780
307
  },
308
  {
309
  "epoch": 6.45,
310
- "learning_rate": 6.778225806451614e-05,
311
- "loss": 0.5812,
312
  "step": 800
313
  },
314
  {
315
  "epoch": 6.45,
316
- "eval_accuracy": 0.8115107913669065,
317
- "eval_loss": 0.7086611986160278,
318
- "eval_runtime": 12.3594,
319
- "eval_samples_per_second": 56.232,
320
- "eval_steps_per_second": 7.039,
321
  "step": 800
322
  },
323
  {
324
  "epoch": 6.61,
325
- "learning_rate": 6.697580645161292e-05,
326
- "loss": 0.6067,
327
  "step": 820
328
  },
329
  {
330
  "epoch": 6.77,
331
- "learning_rate": 6.616935483870968e-05,
332
- "loss": 0.562,
333
  "step": 840
334
  },
335
  {
336
  "epoch": 6.94,
337
- "learning_rate": 6.536290322580646e-05,
338
- "loss": 0.5473,
339
  "step": 860
340
  },
341
  {
342
  "epoch": 7.1,
343
- "learning_rate": 6.455645161290323e-05,
344
- "loss": 0.6343,
345
  "step": 880
346
  },
347
  {
348
  "epoch": 7.26,
349
- "learning_rate": 6.375e-05,
350
- "loss": 0.5714,
351
  "step": 900
352
  },
353
  {
354
  "epoch": 7.26,
355
- "eval_accuracy": 0.8158273381294964,
356
- "eval_loss": 0.6596143245697021,
357
- "eval_runtime": 12.6005,
358
- "eval_samples_per_second": 55.156,
359
- "eval_steps_per_second": 6.904,
360
  "step": 900
361
  },
362
  {
363
  "epoch": 7.42,
364
- "learning_rate": 6.294354838709677e-05,
365
- "loss": 0.4871,
366
  "step": 920
367
  },
368
  {
369
  "epoch": 7.58,
370
- "learning_rate": 6.213709677419355e-05,
371
- "loss": 0.5548,
372
  "step": 940
373
  },
374
  {
375
  "epoch": 7.74,
376
- "learning_rate": 6.133064516129033e-05,
377
- "loss": 0.6063,
378
  "step": 960
379
  },
380
  {
381
  "epoch": 7.9,
382
- "learning_rate": 6.05241935483871e-05,
383
- "loss": 0.5082,
384
  "step": 980
385
  },
386
  {
387
  "epoch": 8.06,
388
- "learning_rate": 5.971774193548387e-05,
389
- "loss": 0.4802,
390
  "step": 1000
391
  },
392
  {
393
  "epoch": 8.06,
394
  "eval_accuracy": 0.781294964028777,
395
- "eval_loss": 0.7486134767532349,
396
- "eval_runtime": 12.6896,
397
- "eval_samples_per_second": 54.769,
398
- "eval_steps_per_second": 6.856,
399
  "step": 1000
400
  },
401
  {
402
  "epoch": 8.23,
403
- "learning_rate": 5.8911290322580645e-05,
404
- "loss": 0.4552,
405
  "step": 1020
406
  },
407
  {
408
  "epoch": 8.39,
409
- "learning_rate": 5.8104838709677424e-05,
410
- "loss": 0.5083,
411
  "step": 1040
412
  },
413
  {
414
  "epoch": 8.55,
415
- "learning_rate": 5.7298387096774197e-05,
416
- "loss": 0.4695,
417
  "step": 1060
418
  },
419
  {
420
  "epoch": 8.71,
421
- "learning_rate": 5.6491935483870976e-05,
422
- "loss": 0.5316,
423
  "step": 1080
424
  },
425
  {
426
  "epoch": 8.87,
427
- "learning_rate": 5.568548387096775e-05,
428
- "loss": 0.5135,
429
  "step": 1100
430
  },
431
  {
432
  "epoch": 8.87,
433
- "eval_accuracy": 0.8258992805755395,
434
- "eval_loss": 0.6208385229110718,
435
- "eval_runtime": 12.6127,
436
- "eval_samples_per_second": 55.103,
437
- "eval_steps_per_second": 6.898,
438
  "step": 1100
439
  },
440
  {
441
  "epoch": 9.03,
442
- "learning_rate": 5.491935483870968e-05,
443
- "loss": 0.4198,
444
  "step": 1120
445
  },
446
  {
447
  "epoch": 9.19,
448
- "learning_rate": 5.411290322580646e-05,
449
- "loss": 0.4311,
450
  "step": 1140
451
  },
452
  {
453
  "epoch": 9.35,
454
- "learning_rate": 5.330645161290323e-05,
455
- "loss": 0.467,
456
  "step": 1160
457
  },
458
  {
459
  "epoch": 9.52,
460
- "learning_rate": 5.25e-05,
461
- "loss": 0.4832,
462
  "step": 1180
463
  },
464
  {
465
  "epoch": 9.68,
466
- "learning_rate": 5.169354838709678e-05,
467
- "loss": 0.4724,
468
  "step": 1200
469
  },
470
  {
471
  "epoch": 9.68,
472
- "eval_accuracy": 0.8129496402877698,
473
- "eval_loss": 0.6434023976325989,
474
- "eval_runtime": 12.4712,
475
- "eval_samples_per_second": 55.729,
476
- "eval_steps_per_second": 6.976,
477
  "step": 1200
478
  },
479
  {
480
  "epoch": 9.84,
481
- "learning_rate": 5.0887096774193554e-05,
482
- "loss": 0.4715,
483
  "step": 1220
484
  },
485
  {
486
  "epoch": 10.0,
487
- "learning_rate": 5.008064516129033e-05,
488
- "loss": 0.4828,
489
  "step": 1240
490
  },
491
  {
492
  "epoch": 10.16,
493
- "learning_rate": 4.92741935483871e-05,
494
- "loss": 0.4171,
495
  "step": 1260
496
  },
497
  {
498
  "epoch": 10.32,
499
- "learning_rate": 4.846774193548387e-05,
500
- "loss": 0.4482,
501
  "step": 1280
502
  },
503
  {
504
  "epoch": 10.48,
505
- "learning_rate": 4.766129032258065e-05,
506
- "loss": 0.3781,
507
  "step": 1300
508
  },
509
  {
510
  "epoch": 10.48,
511
- "eval_accuracy": 0.818705035971223,
512
- "eval_loss": 0.6289911866188049,
513
- "eval_runtime": 12.5799,
514
- "eval_samples_per_second": 55.247,
515
- "eval_steps_per_second": 6.916,
516
  "step": 1300
517
  },
518
  {
519
  "epoch": 10.65,
520
- "learning_rate": 4.685483870967742e-05,
521
- "loss": 0.4689,
522
  "step": 1320
523
  },
524
  {
525
  "epoch": 10.81,
526
- "learning_rate": 4.60483870967742e-05,
527
- "loss": 0.379,
528
  "step": 1340
529
  },
530
  {
531
  "epoch": 10.97,
532
- "learning_rate": 4.5241935483870966e-05,
533
- "loss": 0.4008,
534
  "step": 1360
535
  },
536
  {
537
  "epoch": 11.13,
538
- "learning_rate": 4.4435483870967745e-05,
539
- "loss": 0.3681,
540
  "step": 1380
541
  },
542
  {
543
  "epoch": 11.29,
544
- "learning_rate": 4.362903225806452e-05,
545
- "loss": 0.457,
546
  "step": 1400
547
  },
548
  {
549
  "epoch": 11.29,
550
- "eval_accuracy": 0.8345323741007195,
551
- "eval_loss": 0.6152310967445374,
552
- "eval_runtime": 12.4473,
553
- "eval_samples_per_second": 55.836,
554
- "eval_steps_per_second": 6.989,
555
  "step": 1400
556
  },
557
  {
558
  "epoch": 11.45,
559
- "learning_rate": 4.282258064516129e-05,
560
- "loss": 0.4215,
561
  "step": 1420
562
  },
563
  {
564
  "epoch": 11.61,
565
- "learning_rate": 4.201612903225807e-05,
566
- "loss": 0.3423,
567
  "step": 1440
568
  },
569
  {
570
  "epoch": 11.77,
571
- "learning_rate": 4.120967741935484e-05,
572
- "loss": 0.3994,
573
  "step": 1460
574
  },
575
  {
576
  "epoch": 11.94,
577
- "learning_rate": 4.0403225806451614e-05,
578
- "loss": 0.3745,
579
  "step": 1480
580
  },
581
  {
582
  "epoch": 12.1,
583
- "learning_rate": 3.959677419354839e-05,
584
- "loss": 0.4174,
585
  "step": 1500
586
  },
587
  {
588
  "epoch": 12.1,
589
- "eval_accuracy": 0.814388489208633,
590
- "eval_loss": 0.6348451375961304,
591
- "eval_runtime": 12.3076,
592
- "eval_samples_per_second": 56.469,
593
- "eval_steps_per_second": 7.069,
594
  "step": 1500
595
  },
596
  {
597
  "epoch": 12.26,
598
- "learning_rate": 3.879032258064516e-05,
599
- "loss": 0.3009,
600
  "step": 1520
601
  },
602
  {
603
  "epoch": 12.42,
604
- "learning_rate": 3.798387096774194e-05,
605
- "loss": 0.3015,
606
  "step": 1540
607
  },
608
  {
609
  "epoch": 12.58,
610
- "learning_rate": 3.717741935483871e-05,
611
- "loss": 0.3543,
612
  "step": 1560
613
  },
614
  {
615
  "epoch": 12.74,
616
- "learning_rate": 3.637096774193549e-05,
617
- "loss": 0.3493,
618
  "step": 1580
619
  },
620
  {
621
  "epoch": 12.9,
622
- "learning_rate": 3.556451612903226e-05,
623
- "loss": 0.3778,
624
  "step": 1600
625
  },
626
  {
627
  "epoch": 12.9,
628
- "eval_accuracy": 0.8158273381294964,
629
- "eval_loss": 0.6418473720550537,
630
- "eval_runtime": 12.4779,
631
- "eval_samples_per_second": 55.699,
632
- "eval_steps_per_second": 6.972,
633
  "step": 1600
634
  },
635
  {
636
  "epoch": 13.06,
637
- "learning_rate": 3.475806451612903e-05,
638
- "loss": 0.3284,
639
  "step": 1620
640
  },
641
  {
642
  "epoch": 13.23,
643
- "learning_rate": 3.395161290322581e-05,
644
- "loss": 0.3104,
645
  "step": 1640
646
  },
647
  {
648
  "epoch": 13.39,
649
- "learning_rate": 3.3145161290322585e-05,
650
- "loss": 0.3097,
651
  "step": 1660
652
  },
653
  {
654
  "epoch": 13.55,
655
- "learning_rate": 3.233870967741936e-05,
656
- "loss": 0.3491,
657
  "step": 1680
658
  },
659
  {
660
  "epoch": 13.71,
661
- "learning_rate": 3.153225806451613e-05,
662
- "loss": 0.3391,
663
  "step": 1700
664
  },
665
  {
666
  "epoch": 13.71,
667
- "eval_accuracy": 0.8172661870503597,
668
- "eval_loss": 0.6456794738769531,
669
- "eval_runtime": 12.5081,
670
- "eval_samples_per_second": 55.564,
671
- "eval_steps_per_second": 6.955,
672
  "step": 1700
673
  },
674
  {
675
  "epoch": 13.87,
676
- "learning_rate": 3.076612903225807e-05,
677
- "loss": 0.3287,
678
  "step": 1720
679
  },
680
  {
681
  "epoch": 14.03,
682
- "learning_rate": 2.9959677419354842e-05,
683
- "loss": 0.3154,
684
  "step": 1740
685
  },
686
  {
687
  "epoch": 14.19,
688
- "learning_rate": 2.915322580645161e-05,
689
- "loss": 0.3297,
690
  "step": 1760
691
  },
692
  {
693
  "epoch": 14.35,
694
- "learning_rate": 2.8346774193548387e-05,
695
- "loss": 0.2665,
696
  "step": 1780
697
  },
698
  {
699
  "epoch": 14.52,
700
- "learning_rate": 2.7540322580645163e-05,
701
- "loss": 0.309,
702
  "step": 1800
703
  },
704
  {
705
  "epoch": 14.52,
706
- "eval_accuracy": 0.8100719424460432,
707
- "eval_loss": 0.6428855657577515,
708
- "eval_runtime": 12.5753,
709
- "eval_samples_per_second": 55.267,
710
- "eval_steps_per_second": 6.918,
711
  "step": 1800
712
  },
713
  {
714
  "epoch": 14.68,
715
- "learning_rate": 2.6733870967741935e-05,
716
- "loss": 0.3145,
717
  "step": 1820
718
  },
719
  {
720
  "epoch": 14.84,
721
- "learning_rate": 2.592741935483871e-05,
722
- "loss": 0.3275,
723
  "step": 1840
724
  },
725
  {
726
  "epoch": 15.0,
727
- "learning_rate": 2.5120967741935486e-05,
728
- "loss": 0.3119,
729
  "step": 1860
730
  },
731
  {
732
  "epoch": 15.16,
733
- "learning_rate": 2.431451612903226e-05,
734
- "loss": 0.2758,
735
  "step": 1880
736
  },
737
  {
738
  "epoch": 15.32,
739
- "learning_rate": 2.3508064516129034e-05,
740
- "loss": 0.3543,
741
  "step": 1900
742
  },
743
  {
744
  "epoch": 15.32,
745
- "eval_accuracy": 0.8201438848920863,
746
- "eval_loss": 0.6085776090621948,
747
- "eval_runtime": 12.4853,
748
- "eval_samples_per_second": 55.665,
749
- "eval_steps_per_second": 6.968,
750
  "step": 1900
751
  },
752
  {
753
  "epoch": 15.48,
754
- "learning_rate": 2.2701612903225807e-05,
755
- "loss": 0.2298,
756
  "step": 1920
757
  },
758
  {
759
  "epoch": 15.65,
760
- "learning_rate": 2.1895161290322582e-05,
761
- "loss": 0.3242,
762
  "step": 1940
763
  },
764
  {
765
  "epoch": 15.81,
766
- "learning_rate": 2.1088709677419355e-05,
767
- "loss": 0.2483,
768
  "step": 1960
769
  },
770
  {
771
  "epoch": 15.97,
772
- "learning_rate": 2.028225806451613e-05,
773
- "loss": 0.2685,
774
  "step": 1980
775
  },
776
  {
777
  "epoch": 16.13,
778
- "learning_rate": 1.9475806451612906e-05,
779
- "loss": 0.3232,
780
  "step": 2000
781
  },
782
  {
783
  "epoch": 16.13,
784
- "eval_accuracy": 0.8244604316546763,
785
- "eval_loss": 0.6374786496162415,
786
- "eval_runtime": 12.4055,
787
- "eval_samples_per_second": 56.024,
788
- "eval_steps_per_second": 7.013,
789
  "step": 2000
790
  },
791
  {
792
  "epoch": 16.29,
793
- "learning_rate": 1.8669354838709678e-05,
794
- "loss": 0.2504,
795
  "step": 2020
796
  },
797
  {
798
  "epoch": 16.45,
799
- "learning_rate": 1.786290322580645e-05,
800
- "loss": 0.3025,
801
  "step": 2040
802
  },
803
  {
804
  "epoch": 16.61,
805
- "learning_rate": 1.7056451612903226e-05,
806
- "loss": 0.3046,
807
  "step": 2060
808
  },
809
  {
810
  "epoch": 16.77,
811
- "learning_rate": 1.6250000000000002e-05,
812
- "loss": 0.3086,
813
  "step": 2080
814
  },
815
  {
816
  "epoch": 16.94,
817
- "learning_rate": 1.5443548387096778e-05,
818
- "loss": 0.2687,
819
  "step": 2100
820
  },
821
  {
822
  "epoch": 16.94,
823
- "eval_accuracy": 0.8273381294964028,
824
- "eval_loss": 0.6281846165657043,
825
- "eval_runtime": 12.5909,
826
- "eval_samples_per_second": 55.199,
827
- "eval_steps_per_second": 6.91,
828
  "step": 2100
829
  },
830
  {
831
  "epoch": 17.1,
832
- "learning_rate": 1.4637096774193548e-05,
833
- "loss": 0.3014,
834
  "step": 2120
835
  },
836
  {
837
  "epoch": 17.26,
838
- "learning_rate": 1.3830645161290324e-05,
839
- "loss": 0.2911,
840
  "step": 2140
841
  },
842
  {
843
  "epoch": 17.42,
844
- "learning_rate": 1.3024193548387098e-05,
845
- "loss": 0.262,
846
  "step": 2160
847
  },
848
  {
849
  "epoch": 17.58,
850
- "learning_rate": 1.2217741935483872e-05,
851
- "loss": 0.2739,
852
  "step": 2180
853
  },
854
  {
855
  "epoch": 17.74,
856
- "learning_rate": 1.1411290322580646e-05,
857
- "loss": 0.2551,
858
  "step": 2200
859
  },
860
  {
861
  "epoch": 17.74,
862
- "eval_accuracy": 0.8215827338129497,
863
- "eval_loss": 0.6021825075149536,
864
- "eval_runtime": 12.4226,
865
- "eval_samples_per_second": 55.946,
866
- "eval_steps_per_second": 7.003,
867
  "step": 2200
868
  },
869
  {
870
  "epoch": 17.9,
871
- "learning_rate": 1.060483870967742e-05,
872
- "loss": 0.3001,
873
  "step": 2220
874
  },
875
  {
876
  "epoch": 18.06,
877
- "learning_rate": 9.798387096774194e-06,
878
- "loss": 0.5205,
879
  "step": 2240
880
  },
881
  {
882
  "epoch": 18.23,
883
- "learning_rate": 8.991935483870968e-06,
884
- "loss": 0.285,
885
  "step": 2260
886
  },
887
  {
888
  "epoch": 18.39,
889
- "learning_rate": 8.185483870967742e-06,
890
- "loss": 0.2684,
891
  "step": 2280
892
  },
893
  {
894
  "epoch": 18.55,
895
- "learning_rate": 7.379032258064517e-06,
896
- "loss": 0.2711,
897
  "step": 2300
898
  },
899
  {
900
  "epoch": 18.55,
901
- "eval_accuracy": 0.818705035971223,
902
- "eval_loss": 0.6055982708930969,
903
- "eval_runtime": 12.4967,
904
- "eval_samples_per_second": 55.615,
905
- "eval_steps_per_second": 6.962,
906
  "step": 2300
907
  },
908
  {
909
  "epoch": 18.71,
910
- "learning_rate": 6.57258064516129e-06,
911
- "loss": 0.2776,
912
  "step": 2320
913
  },
914
  {
915
  "epoch": 18.87,
916
- "learning_rate": 5.766129032258065e-06,
917
- "loss": 0.2575,
918
  "step": 2340
919
  },
920
  {
921
  "epoch": 19.03,
922
- "learning_rate": 4.9596774193548395e-06,
923
- "loss": 0.2816,
924
  "step": 2360
925
  },
926
  {
927
  "epoch": 19.19,
928
- "learning_rate": 4.1532258064516135e-06,
929
- "loss": 0.2369,
930
  "step": 2380
931
  },
932
  {
933
  "epoch": 19.35,
934
- "learning_rate": 3.3467741935483875e-06,
935
- "loss": 0.2371,
936
  "step": 2400
937
  },
938
  {
939
  "epoch": 19.35,
940
- "eval_accuracy": 0.823021582733813,
941
- "eval_loss": 0.6057971715927124,
942
- "eval_runtime": 12.6132,
943
- "eval_samples_per_second": 55.101,
944
- "eval_steps_per_second": 6.898,
945
  "step": 2400
946
  },
947
  {
948
  "epoch": 19.52,
949
- "learning_rate": 2.5403225806451615e-06,
950
- "loss": 0.2299,
951
  "step": 2420
952
  },
953
  {
954
  "epoch": 19.68,
955
- "learning_rate": 1.7338709677419355e-06,
956
- "loss": 0.203,
957
  "step": 2440
958
  },
959
  {
960
  "epoch": 19.84,
961
- "learning_rate": 9.274193548387097e-07,
962
- "loss": 0.301,
963
  "step": 2460
964
  },
965
  {
966
  "epoch": 20.0,
967
- "learning_rate": 1.2096774193548387e-07,
968
- "loss": 0.2444,
969
  "step": 2480
970
  },
971
  {
972
- "epoch": 20.0,
973
- "step": 2480,
974
- "total_flos": 6.103469271205233e+18,
975
- "train_loss": 0.6948060337574251,
976
- "train_runtime": 2796.6359,
977
- "train_samples_per_second": 28.155,
978
- "train_steps_per_second": 0.887
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
979
  }
980
  ],
981
  "logging_steps": 20,
982
- "max_steps": 2480,
983
- "num_train_epochs": 20,
984
  "save_steps": 100,
985
- "total_flos": 6.103469271205233e+18,
986
  "trial_name": null,
987
  "trial_params": null
988
  }
 
1
  {
2
+ "best_metric": 0.5942012071609497,
3
+ "best_model_checkpoint": "Hierarchical_Agent_Action/checkpoint-3100",
4
+ "epoch": 30.0,
5
  "eval_steps": 100,
6
+ "global_step": 3720,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.16,
13
+ "learning_rate": 9.946236559139786e-05,
14
+ "loss": 3.4378,
15
  "step": 20
16
  },
17
  {
18
  "epoch": 0.32,
19
+ "learning_rate": 9.892473118279571e-05,
20
+ "loss": 3.1863,
21
  "step": 40
22
  },
23
  {
24
  "epoch": 0.48,
25
+ "learning_rate": 9.838709677419355e-05,
26
+ "loss": 2.9075,
27
  "step": 60
28
  },
29
  {
30
  "epoch": 0.65,
31
+ "learning_rate": 9.78494623655914e-05,
32
+ "loss": 2.6661,
33
  "step": 80
34
  },
35
  {
36
  "epoch": 0.81,
37
+ "learning_rate": 9.731182795698925e-05,
38
+ "loss": 2.4407,
39
  "step": 100
40
  },
41
  {
42
  "epoch": 0.81,
43
+ "eval_accuracy": 0.6057553956834533,
44
+ "eval_loss": 2.271561622619629,
45
+ "eval_runtime": 12.2022,
46
+ "eval_samples_per_second": 56.957,
47
+ "eval_steps_per_second": 7.13,
48
  "step": 100
49
  },
50
  {
51
  "epoch": 0.97,
52
+ "learning_rate": 9.677419354838711e-05,
53
+ "loss": 2.2877,
54
  "step": 120
55
  },
56
  {
57
  "epoch": 1.13,
58
+ "learning_rate": 9.626344086021506e-05,
59
+ "loss": 2.0861,
60
  "step": 140
61
  },
62
  {
63
  "epoch": 1.29,
64
+ "learning_rate": 9.572580645161291e-05,
65
+ "loss": 2.0179,
66
  "step": 160
67
  },
68
  {
69
  "epoch": 1.45,
70
+ "learning_rate": 9.518817204301076e-05,
71
+ "loss": 1.8839,
72
  "step": 180
73
  },
74
  {
75
  "epoch": 1.61,
76
+ "learning_rate": 9.465053763440861e-05,
77
+ "loss": 1.7756,
78
  "step": 200
79
  },
80
  {
81
  "epoch": 1.61,
82
+ "eval_accuracy": 0.7064748201438849,
83
+ "eval_loss": 1.6162313222885132,
84
+ "eval_runtime": 13.1922,
85
+ "eval_samples_per_second": 52.683,
86
+ "eval_steps_per_second": 6.595,
87
  "step": 200
88
  },
89
  {
90
  "epoch": 1.77,
91
+ "learning_rate": 9.411290322580645e-05,
92
+ "loss": 1.7271,
93
  "step": 220
94
  },
95
  {
96
  "epoch": 1.94,
97
+ "learning_rate": 9.357526881720431e-05,
98
+ "loss": 1.5882,
99
  "step": 240
100
  },
101
  {
102
  "epoch": 2.1,
103
+ "learning_rate": 9.303763440860216e-05,
104
+ "loss": 1.4613,
105
  "step": 260
106
  },
107
  {
108
  "epoch": 2.26,
109
+ "learning_rate": 9.250000000000001e-05,
110
+ "loss": 1.4197,
111
  "step": 280
112
  },
113
  {
114
  "epoch": 2.42,
115
+ "learning_rate": 9.196236559139786e-05,
116
+ "loss": 1.3948,
117
  "step": 300
118
  },
119
  {
120
  "epoch": 2.42,
121
+ "eval_accuracy": 0.7697841726618705,
122
+ "eval_loss": 1.2199510335922241,
123
+ "eval_runtime": 12.2033,
124
+ "eval_samples_per_second": 56.952,
125
+ "eval_steps_per_second": 7.129,
126
  "step": 300
127
  },
128
  {
129
  "epoch": 2.58,
130
+ "learning_rate": 9.14247311827957e-05,
131
+ "loss": 1.3314,
132
  "step": 320
133
  },
134
  {
135
  "epoch": 2.74,
136
+ "learning_rate": 9.088709677419354e-05,
137
+ "loss": 1.3175,
138
  "step": 340
139
  },
140
  {
141
  "epoch": 2.9,
142
+ "learning_rate": 9.03494623655914e-05,
143
+ "loss": 1.2503,
144
  "step": 360
145
  },
146
  {
147
  "epoch": 3.06,
148
+ "learning_rate": 8.983870967741936e-05,
149
+ "loss": 1.1104,
150
  "step": 380
151
  },
152
  {
153
  "epoch": 3.23,
154
+ "learning_rate": 8.930107526881721e-05,
155
+ "loss": 1.131,
156
  "step": 400
157
  },
158
  {
159
  "epoch": 3.23,
160
+ "eval_accuracy": 0.7856115107913669,
161
+ "eval_loss": 1.0011802911758423,
162
+ "eval_runtime": 12.1771,
163
+ "eval_samples_per_second": 57.074,
164
+ "eval_steps_per_second": 7.145,
165
  "step": 400
166
  },
167
  {
168
  "epoch": 3.39,
169
+ "learning_rate": 8.876344086021506e-05,
170
+ "loss": 1.1084,
171
  "step": 420
172
  },
173
  {
174
  "epoch": 3.55,
175
+ "learning_rate": 8.82258064516129e-05,
176
+ "loss": 0.9978,
177
  "step": 440
178
  },
179
  {
180
  "epoch": 3.71,
181
+ "learning_rate": 8.768817204301076e-05,
182
+ "loss": 1.0633,
183
  "step": 460
184
  },
185
  {
186
  "epoch": 3.87,
187
+ "learning_rate": 8.715053763440861e-05,
188
+ "loss": 0.9801,
189
  "step": 480
190
  },
191
  {
192
  "epoch": 4.03,
193
+ "learning_rate": 8.661290322580645e-05,
194
+ "loss": 0.9239,
195
  "step": 500
196
  },
197
  {
198
  "epoch": 4.03,
199
+ "eval_accuracy": 0.7827338129496403,
200
+ "eval_loss": 0.9055307507514954,
201
+ "eval_runtime": 12.1524,
202
+ "eval_samples_per_second": 57.19,
203
+ "eval_steps_per_second": 7.159,
204
  "step": 500
205
  },
206
  {
207
  "epoch": 4.19,
208
+ "learning_rate": 8.60752688172043e-05,
209
+ "loss": 0.8721,
210
  "step": 520
211
  },
212
  {
213
  "epoch": 4.35,
214
+ "learning_rate": 8.553763440860215e-05,
215
+ "loss": 0.8904,
216
  "step": 540
217
  },
218
  {
219
  "epoch": 4.52,
220
+ "learning_rate": 8.5e-05,
221
+ "loss": 0.8903,
222
  "step": 560
223
  },
224
  {
225
  "epoch": 4.68,
226
+ "learning_rate": 8.446236559139785e-05,
227
+ "loss": 0.8564,
228
  "step": 580
229
  },
230
  {
231
  "epoch": 4.84,
232
+ "learning_rate": 8.39247311827957e-05,
233
+ "loss": 0.8699,
234
  "step": 600
235
  },
236
  {
237
  "epoch": 4.84,
238
+ "eval_accuracy": 0.7827338129496403,
239
+ "eval_loss": 0.8103253841400146,
240
+ "eval_runtime": 11.9742,
241
+ "eval_samples_per_second": 58.041,
242
+ "eval_steps_per_second": 7.266,
243
  "step": 600
244
  },
245
  {
246
  "epoch": 5.0,
247
+ "learning_rate": 8.338709677419355e-05,
248
+ "loss": 0.8503,
249
  "step": 620
250
  },
251
  {
252
  "epoch": 5.16,
253
+ "learning_rate": 8.28494623655914e-05,
254
+ "loss": 0.7251,
255
  "step": 640
256
  },
257
  {
258
  "epoch": 5.32,
259
+ "learning_rate": 8.231182795698926e-05,
260
+ "loss": 0.6939,
261
  "step": 660
262
  },
263
  {
264
  "epoch": 5.48,
265
+ "learning_rate": 8.17741935483871e-05,
266
+ "loss": 0.7263,
267
  "step": 680
268
  },
269
  {
270
  "epoch": 5.65,
271
+ "learning_rate": 8.123655913978495e-05,
272
+ "loss": 0.6707,
273
  "step": 700
274
  },
275
  {
276
  "epoch": 5.65,
277
+ "eval_accuracy": 0.7841726618705036,
278
+ "eval_loss": 0.7609687447547913,
279
+ "eval_runtime": 12.2276,
280
+ "eval_samples_per_second": 56.839,
281
+ "eval_steps_per_second": 7.115,
282
  "step": 700
283
  },
284
  {
285
  "epoch": 5.81,
286
+ "learning_rate": 8.06989247311828e-05,
287
+ "loss": 0.6822,
288
  "step": 720
289
  },
290
  {
291
  "epoch": 5.97,
292
+ "learning_rate": 8.016129032258065e-05,
293
+ "loss": 0.716,
294
  "step": 740
295
  },
296
  {
297
  "epoch": 6.13,
298
+ "learning_rate": 7.96236559139785e-05,
299
+ "loss": 0.6162,
300
  "step": 760
301
  },
302
  {
303
  "epoch": 6.29,
304
+ "learning_rate": 7.908602150537635e-05,
305
+ "loss": 0.6409,
306
  "step": 780
307
  },
308
  {
309
  "epoch": 6.45,
310
+ "learning_rate": 7.85483870967742e-05,
311
+ "loss": 0.6206,
312
  "step": 800
313
  },
314
  {
315
  "epoch": 6.45,
316
+ "eval_accuracy": 0.7884892086330936,
317
+ "eval_loss": 0.7312321066856384,
318
+ "eval_runtime": 12.0325,
319
+ "eval_samples_per_second": 57.76,
320
+ "eval_steps_per_second": 7.23,
321
  "step": 800
322
  },
323
  {
324
  "epoch": 6.61,
325
+ "learning_rate": 7.801075268817205e-05,
326
+ "loss": 0.6156,
327
  "step": 820
328
  },
329
  {
330
  "epoch": 6.77,
331
+ "learning_rate": 7.74731182795699e-05,
332
+ "loss": 0.5887,
333
  "step": 840
334
  },
335
  {
336
  "epoch": 6.94,
337
+ "learning_rate": 7.693548387096776e-05,
338
+ "loss": 0.549,
339
  "step": 860
340
  },
341
  {
342
  "epoch": 7.1,
343
+ "learning_rate": 7.63978494623656e-05,
344
+ "loss": 0.581,
345
  "step": 880
346
  },
347
  {
348
  "epoch": 7.26,
349
+ "learning_rate": 7.586021505376343e-05,
350
+ "loss": 0.5795,
351
  "step": 900
352
  },
353
  {
354
  "epoch": 7.26,
355
+ "eval_accuracy": 0.8100719424460432,
356
+ "eval_loss": 0.6988666653633118,
357
+ "eval_runtime": 12.1203,
358
+ "eval_samples_per_second": 57.342,
359
+ "eval_steps_per_second": 7.178,
360
  "step": 900
361
  },
362
  {
363
  "epoch": 7.42,
364
+ "learning_rate": 7.532258064516129e-05,
365
+ "loss": 0.4974,
366
  "step": 920
367
  },
368
  {
369
  "epoch": 7.58,
370
+ "learning_rate": 7.478494623655914e-05,
371
+ "loss": 0.5397,
372
  "step": 940
373
  },
374
  {
375
  "epoch": 7.74,
376
+ "learning_rate": 7.424731182795699e-05,
377
+ "loss": 0.6128,
378
  "step": 960
379
  },
380
  {
381
  "epoch": 7.9,
382
+ "learning_rate": 7.370967741935485e-05,
383
+ "loss": 0.5246,
384
  "step": 980
385
  },
386
  {
387
  "epoch": 8.06,
388
+ "learning_rate": 7.317204301075268e-05,
389
+ "loss": 0.4914,
390
  "step": 1000
391
  },
392
  {
393
  "epoch": 8.06,
394
  "eval_accuracy": 0.781294964028777,
395
+ "eval_loss": 0.7066284418106079,
396
+ "eval_runtime": 12.169,
397
+ "eval_samples_per_second": 57.112,
398
+ "eval_steps_per_second": 7.149,
399
  "step": 1000
400
  },
401
  {
402
  "epoch": 8.23,
403
+ "learning_rate": 7.263440860215054e-05,
404
+ "loss": 0.4653,
405
  "step": 1020
406
  },
407
  {
408
  "epoch": 8.39,
409
+ "learning_rate": 7.209677419354839e-05,
410
+ "loss": 0.486,
411
  "step": 1040
412
  },
413
  {
414
  "epoch": 8.55,
415
+ "learning_rate": 7.155913978494624e-05,
416
+ "loss": 0.4781,
417
  "step": 1060
418
  },
419
  {
420
  "epoch": 8.71,
421
+ "learning_rate": 7.102150537634408e-05,
422
+ "loss": 0.5277,
423
  "step": 1080
424
  },
425
  {
426
  "epoch": 8.87,
427
+ "learning_rate": 7.048387096774193e-05,
428
+ "loss": 0.5087,
429
  "step": 1100
430
  },
431
  {
432
  "epoch": 8.87,
433
+ "eval_accuracy": 0.818705035971223,
434
+ "eval_loss": 0.6398155689239502,
435
+ "eval_runtime": 12.2159,
436
+ "eval_samples_per_second": 56.893,
437
+ "eval_steps_per_second": 7.122,
438
  "step": 1100
439
  },
440
  {
441
  "epoch": 9.03,
442
+ "learning_rate": 6.994623655913979e-05,
443
+ "loss": 0.4671,
444
  "step": 1120
445
  },
446
  {
447
  "epoch": 9.19,
448
+ "learning_rate": 6.940860215053764e-05,
449
+ "loss": 0.4841,
450
  "step": 1140
451
  },
452
  {
453
  "epoch": 9.35,
454
+ "learning_rate": 6.887096774193549e-05,
455
+ "loss": 0.4619,
456
  "step": 1160
457
  },
458
  {
459
  "epoch": 9.52,
460
+ "learning_rate": 6.833333333333333e-05,
461
+ "loss": 0.479,
462
  "step": 1180
463
  },
464
  {
465
  "epoch": 9.68,
466
+ "learning_rate": 6.779569892473118e-05,
467
+ "loss": 0.4373,
468
  "step": 1200
469
  },
470
  {
471
  "epoch": 9.68,
472
+ "eval_accuracy": 0.8043165467625899,
473
+ "eval_loss": 0.6293413043022156,
474
+ "eval_runtime": 12.0132,
475
+ "eval_samples_per_second": 57.853,
476
+ "eval_steps_per_second": 7.242,
477
  "step": 1200
478
  },
479
  {
480
  "epoch": 9.84,
481
+ "learning_rate": 6.725806451612904e-05,
482
+ "loss": 0.4989,
483
  "step": 1220
484
  },
485
  {
486
  "epoch": 10.0,
487
+ "learning_rate": 6.672043010752689e-05,
488
+ "loss": 0.4829,
489
  "step": 1240
490
  },
491
  {
492
  "epoch": 10.16,
493
+ "learning_rate": 6.618279569892474e-05,
494
+ "loss": 0.458,
495
  "step": 1260
496
  },
497
  {
498
  "epoch": 10.32,
499
+ "learning_rate": 6.564516129032258e-05,
500
+ "loss": 0.4135,
501
  "step": 1280
502
  },
503
  {
504
  "epoch": 10.48,
505
+ "learning_rate": 6.510752688172043e-05,
506
+ "loss": 0.4365,
507
  "step": 1300
508
  },
509
  {
510
  "epoch": 10.48,
511
+ "eval_accuracy": 0.7971223021582734,
512
+ "eval_loss": 0.672641932964325,
513
+ "eval_runtime": 12.1208,
514
+ "eval_samples_per_second": 57.339,
515
+ "eval_steps_per_second": 7.178,
516
  "step": 1300
517
  },
518
  {
519
  "epoch": 10.65,
520
+ "learning_rate": 6.456989247311829e-05,
521
+ "loss": 0.4852,
522
  "step": 1320
523
  },
524
  {
525
  "epoch": 10.81,
526
+ "learning_rate": 6.403225806451614e-05,
527
+ "loss": 0.3868,
528
  "step": 1340
529
  },
530
  {
531
  "epoch": 10.97,
532
+ "learning_rate": 6.349462365591398e-05,
533
+ "loss": 0.4663,
534
  "step": 1360
535
  },
536
  {
537
  "epoch": 11.13,
538
+ "learning_rate": 6.295698924731183e-05,
539
+ "loss": 0.396,
540
  "step": 1380
541
  },
542
  {
543
  "epoch": 11.29,
544
+ "learning_rate": 6.241935483870967e-05,
545
+ "loss": 0.4517,
546
  "step": 1400
547
  },
548
  {
549
  "epoch": 11.29,
550
+ "eval_accuracy": 0.8244604316546763,
551
+ "eval_loss": 0.6046626567840576,
552
+ "eval_runtime": 12.0781,
553
+ "eval_samples_per_second": 57.542,
554
+ "eval_steps_per_second": 7.203,
555
  "step": 1400
556
  },
557
  {
558
  "epoch": 11.45,
559
+ "learning_rate": 6.188172043010752e-05,
560
+ "loss": 0.4272,
561
  "step": 1420
562
  },
563
  {
564
  "epoch": 11.61,
565
+ "learning_rate": 6.134408602150538e-05,
566
+ "loss": 0.3438,
567
  "step": 1440
568
  },
569
  {
570
  "epoch": 11.77,
571
+ "learning_rate": 6.080645161290322e-05,
572
+ "loss": 0.3741,
573
  "step": 1460
574
  },
575
  {
576
  "epoch": 11.94,
577
+ "learning_rate": 6.0268817204301075e-05,
578
+ "loss": 0.3633,
579
  "step": 1480
580
  },
581
  {
582
  "epoch": 12.1,
583
+ "learning_rate": 5.973118279569893e-05,
584
+ "loss": 0.4114,
585
  "step": 1500
586
  },
587
  {
588
  "epoch": 12.1,
589
+ "eval_accuracy": 0.823021582733813,
590
+ "eval_loss": 0.6088372468948364,
591
+ "eval_runtime": 12.1467,
592
+ "eval_samples_per_second": 57.217,
593
+ "eval_steps_per_second": 7.162,
594
  "step": 1500
595
  },
596
  {
597
  "epoch": 12.26,
598
+ "learning_rate": 5.9193548387096774e-05,
599
+ "loss": 0.3284,
600
  "step": 1520
601
  },
602
  {
603
  "epoch": 12.42,
604
+ "learning_rate": 5.8655913978494627e-05,
605
+ "loss": 0.3329,
606
  "step": 1540
607
  },
608
  {
609
  "epoch": 12.58,
610
+ "learning_rate": 5.811827956989247e-05,
611
+ "loss": 0.3656,
612
  "step": 1560
613
  },
614
  {
615
  "epoch": 12.74,
616
+ "learning_rate": 5.7580645161290325e-05,
617
+ "loss": 0.3581,
618
  "step": 1580
619
  },
620
  {
621
  "epoch": 12.9,
622
+ "learning_rate": 5.704301075268818e-05,
623
+ "loss": 0.426,
624
  "step": 1600
625
  },
626
  {
627
  "epoch": 12.9,
628
+ "eval_accuracy": 0.8201438848920863,
629
+ "eval_loss": 0.6164522171020508,
630
+ "eval_runtime": 12.2797,
631
+ "eval_samples_per_second": 56.597,
632
+ "eval_steps_per_second": 7.085,
633
  "step": 1600
634
  },
635
  {
636
  "epoch": 13.06,
637
+ "learning_rate": 5.6505376344086024e-05,
638
+ "loss": 0.4441,
639
  "step": 1620
640
  },
641
  {
642
  "epoch": 13.23,
643
+ "learning_rate": 5.599462365591398e-05,
644
+ "loss": 0.3141,
645
  "step": 1640
646
  },
647
  {
648
  "epoch": 13.39,
649
+ "learning_rate": 5.545698924731183e-05,
650
+ "loss": 0.3072,
651
  "step": 1660
652
  },
653
  {
654
  "epoch": 13.55,
655
+ "learning_rate": 5.491935483870968e-05,
656
+ "loss": 0.3348,
657
  "step": 1680
658
  },
659
  {
660
  "epoch": 13.71,
661
+ "learning_rate": 5.438172043010753e-05,
662
+ "loss": 0.3456,
663
  "step": 1700
664
  },
665
  {
666
  "epoch": 13.71,
667
+ "eval_accuracy": 0.8258992805755395,
668
+ "eval_loss": 0.6133091449737549,
669
+ "eval_runtime": 12.0446,
670
+ "eval_samples_per_second": 57.702,
671
+ "eval_steps_per_second": 7.223,
672
  "step": 1700
673
  },
674
  {
675
  "epoch": 13.87,
676
+ "learning_rate": 5.384408602150538e-05,
677
+ "loss": 0.3338,
678
  "step": 1720
679
  },
680
  {
681
  "epoch": 14.03,
682
+ "learning_rate": 5.330645161290323e-05,
683
+ "loss": 0.3473,
684
  "step": 1740
685
  },
686
  {
687
  "epoch": 14.19,
688
+ "learning_rate": 5.276881720430108e-05,
689
+ "loss": 0.3563,
690
  "step": 1760
691
  },
692
  {
693
  "epoch": 14.35,
694
+ "learning_rate": 5.223118279569893e-05,
695
+ "loss": 0.3086,
696
  "step": 1780
697
  },
698
  {
699
  "epoch": 14.52,
700
+ "learning_rate": 5.169354838709678e-05,
701
+ "loss": 0.332,
702
  "step": 1800
703
  },
704
  {
705
  "epoch": 14.52,
706
+ "eval_accuracy": 0.8201438848920863,
707
+ "eval_loss": 0.6735997200012207,
708
+ "eval_runtime": 11.991,
709
+ "eval_samples_per_second": 57.96,
710
+ "eval_steps_per_second": 7.255,
711
  "step": 1800
712
  },
713
  {
714
  "epoch": 14.68,
715
+ "learning_rate": 5.115591397849463e-05,
716
+ "loss": 0.3146,
717
  "step": 1820
718
  },
719
  {
720
  "epoch": 14.84,
721
+ "learning_rate": 5.061827956989248e-05,
722
+ "loss": 0.3475,
723
  "step": 1840
724
  },
725
  {
726
  "epoch": 15.0,
727
+ "learning_rate": 5.008064516129033e-05,
728
+ "loss": 0.3397,
729
  "step": 1860
730
  },
731
  {
732
  "epoch": 15.16,
733
+ "learning_rate": 4.954301075268817e-05,
734
+ "loss": 0.2829,
735
  "step": 1880
736
  },
737
  {
738
  "epoch": 15.32,
739
+ "learning_rate": 4.9005376344086024e-05,
740
+ "loss": 0.3646,
741
  "step": 1900
742
  },
743
  {
744
  "epoch": 15.32,
745
+ "eval_accuracy": 0.8172661870503597,
746
+ "eval_loss": 0.6405801177024841,
747
+ "eval_runtime": 12.0646,
748
+ "eval_samples_per_second": 57.607,
749
+ "eval_steps_per_second": 7.211,
750
  "step": 1900
751
  },
752
  {
753
  "epoch": 15.48,
754
+ "learning_rate": 4.846774193548387e-05,
755
+ "loss": 0.2571,
756
  "step": 1920
757
  },
758
  {
759
  "epoch": 15.65,
760
+ "learning_rate": 4.793010752688172e-05,
761
+ "loss": 0.3495,
762
  "step": 1940
763
  },
764
  {
765
  "epoch": 15.81,
766
+ "learning_rate": 4.7392473118279576e-05,
767
+ "loss": 0.2647,
768
  "step": 1960
769
  },
770
  {
771
  "epoch": 15.97,
772
+ "learning_rate": 4.685483870967742e-05,
773
+ "loss": 0.3063,
774
  "step": 1980
775
  },
776
  {
777
  "epoch": 16.13,
778
+ "learning_rate": 4.6317204301075275e-05,
779
+ "loss": 0.3287,
780
  "step": 2000
781
  },
782
  {
783
  "epoch": 16.13,
784
+ "eval_accuracy": 0.7971223021582734,
785
+ "eval_loss": 0.6978276968002319,
786
+ "eval_runtime": 12.1198,
787
+ "eval_samples_per_second": 57.344,
788
+ "eval_steps_per_second": 7.178,
789
  "step": 2000
790
  },
791
  {
792
  "epoch": 16.29,
793
+ "learning_rate": 4.577956989247312e-05,
794
+ "loss": 0.2456,
795
  "step": 2020
796
  },
797
  {
798
  "epoch": 16.45,
799
+ "learning_rate": 4.5241935483870966e-05,
800
+ "loss": 0.3242,
801
  "step": 2040
802
  },
803
  {
804
  "epoch": 16.61,
805
+ "learning_rate": 4.470430107526882e-05,
806
+ "loss": 0.3186,
807
  "step": 2060
808
  },
809
  {
810
  "epoch": 16.77,
811
+ "learning_rate": 4.4166666666666665e-05,
812
+ "loss": 0.3462,
813
  "step": 2080
814
  },
815
  {
816
  "epoch": 16.94,
817
+ "learning_rate": 4.362903225806452e-05,
818
+ "loss": 0.2793,
819
  "step": 2100
820
  },
821
  {
822
  "epoch": 16.94,
823
+ "eval_accuracy": 0.8172661870503597,
824
+ "eval_loss": 0.6432561278343201,
825
+ "eval_runtime": 12.0081,
826
+ "eval_samples_per_second": 57.878,
827
+ "eval_steps_per_second": 7.245,
828
  "step": 2100
829
  },
830
  {
831
  "epoch": 17.1,
832
+ "learning_rate": 4.309139784946237e-05,
833
+ "loss": 0.2849,
834
  "step": 2120
835
  },
836
  {
837
  "epoch": 17.26,
838
+ "learning_rate": 4.2553763440860216e-05,
839
+ "loss": 0.3371,
840
  "step": 2140
841
  },
842
  {
843
  "epoch": 17.42,
844
+ "learning_rate": 4.201612903225807e-05,
845
+ "loss": 0.2561,
846
  "step": 2160
847
  },
848
  {
849
  "epoch": 17.58,
850
+ "learning_rate": 4.1478494623655915e-05,
851
+ "loss": 0.2628,
852
  "step": 2180
853
  },
854
  {
855
  "epoch": 17.74,
856
+ "learning_rate": 4.094086021505377e-05,
857
+ "loss": 0.2924,
858
  "step": 2200
859
  },
860
  {
861
  "epoch": 17.74,
862
+ "eval_accuracy": 0.814388489208633,
863
+ "eval_loss": 0.6474089622497559,
864
+ "eval_runtime": 12.255,
865
+ "eval_samples_per_second": 56.711,
866
+ "eval_steps_per_second": 7.099,
867
  "step": 2200
868
  },
869
  {
870
  "epoch": 17.9,
871
+ "learning_rate": 4.0403225806451614e-05,
872
+ "loss": 0.2935,
873
  "step": 2220
874
  },
875
  {
876
  "epoch": 18.06,
877
+ "learning_rate": 3.9865591397849466e-05,
878
+ "loss": 0.5124,
879
  "step": 2240
880
  },
881
  {
882
  "epoch": 18.23,
883
+ "learning_rate": 3.932795698924731e-05,
884
+ "loss": 0.2954,
885
  "step": 2260
886
  },
887
  {
888
  "epoch": 18.39,
889
+ "learning_rate": 3.879032258064516e-05,
890
+ "loss": 0.2584,
891
  "step": 2280
892
  },
893
  {
894
  "epoch": 18.55,
895
+ "learning_rate": 3.825268817204301e-05,
896
+ "loss": 0.2605,
897
  "step": 2300
898
  },
899
  {
900
  "epoch": 18.55,
901
+ "eval_accuracy": 0.8287769784172662,
902
+ "eval_loss": 0.627909243106842,
903
+ "eval_runtime": 12.1775,
904
+ "eval_samples_per_second": 57.072,
905
+ "eval_steps_per_second": 7.144,
906
  "step": 2300
907
  },
908
  {
909
  "epoch": 18.71,
910
+ "learning_rate": 3.7715053763440864e-05,
911
+ "loss": 0.312,
912
  "step": 2320
913
  },
914
  {
915
  "epoch": 18.87,
916
+ "learning_rate": 3.717741935483871e-05,
917
+ "loss": 0.2679,
918
  "step": 2340
919
  },
920
  {
921
  "epoch": 19.03,
922
+ "learning_rate": 3.663978494623656e-05,
923
+ "loss": 0.2654,
924
  "step": 2360
925
  },
926
  {
927
  "epoch": 19.19,
928
+ "learning_rate": 3.610215053763441e-05,
929
+ "loss": 0.2524,
930
  "step": 2380
931
  },
932
  {
933
  "epoch": 19.35,
934
+ "learning_rate": 3.556451612903226e-05,
935
+ "loss": 0.2016,
936
  "step": 2400
937
  },
938
  {
939
  "epoch": 19.35,
940
+ "eval_accuracy": 0.8215827338129497,
941
+ "eval_loss": 0.6360746026039124,
942
+ "eval_runtime": 12.0929,
943
+ "eval_samples_per_second": 57.472,
944
+ "eval_steps_per_second": 7.194,
945
  "step": 2400
946
  },
947
  {
948
  "epoch": 19.52,
949
+ "learning_rate": 3.502688172043011e-05,
950
+ "loss": 0.2691,
951
  "step": 2420
952
  },
953
  {
954
  "epoch": 19.68,
955
+ "learning_rate": 3.448924731182796e-05,
956
+ "loss": 0.2068,
957
  "step": 2440
958
  },
959
  {
960
  "epoch": 19.84,
961
+ "learning_rate": 3.395161290322581e-05,
962
+ "loss": 0.3017,
963
  "step": 2460
964
  },
965
  {
966
  "epoch": 20.0,
967
+ "learning_rate": 3.341397849462366e-05,
968
+ "loss": 0.2318,
969
  "step": 2480
970
  },
971
  {
972
+ "epoch": 20.16,
973
+ "learning_rate": 3.2876344086021504e-05,
974
+ "loss": 0.2524,
975
+ "step": 2500
976
+ },
977
+ {
978
+ "epoch": 20.16,
979
+ "eval_accuracy": 0.8258992805755395,
980
+ "eval_loss": 0.6393768191337585,
981
+ "eval_runtime": 12.2058,
982
+ "eval_samples_per_second": 56.94,
983
+ "eval_steps_per_second": 7.128,
984
+ "step": 2500
985
+ },
986
+ {
987
+ "epoch": 20.32,
988
+ "learning_rate": 3.233870967741936e-05,
989
+ "loss": 0.2603,
990
+ "step": 2520
991
+ },
992
+ {
993
+ "epoch": 20.48,
994
+ "learning_rate": 3.18010752688172e-05,
995
+ "loss": 0.207,
996
+ "step": 2540
997
+ },
998
+ {
999
+ "epoch": 20.65,
1000
+ "learning_rate": 3.1263440860215056e-05,
1001
+ "loss": 0.1927,
1002
+ "step": 2560
1003
+ },
1004
+ {
1005
+ "epoch": 20.81,
1006
+ "learning_rate": 3.07258064516129e-05,
1007
+ "loss": 0.1947,
1008
+ "step": 2580
1009
+ },
1010
+ {
1011
+ "epoch": 20.97,
1012
+ "learning_rate": 3.0188172043010754e-05,
1013
+ "loss": 0.2017,
1014
+ "step": 2600
1015
+ },
1016
+ {
1017
+ "epoch": 20.97,
1018
+ "eval_accuracy": 0.8158273381294964,
1019
+ "eval_loss": 0.6682714223861694,
1020
+ "eval_runtime": 12.0188,
1021
+ "eval_samples_per_second": 57.826,
1022
+ "eval_steps_per_second": 7.239,
1023
+ "step": 2600
1024
+ },
1025
+ {
1026
+ "epoch": 21.13,
1027
+ "learning_rate": 2.9650537634408604e-05,
1028
+ "loss": 0.2351,
1029
+ "step": 2620
1030
+ },
1031
+ {
1032
+ "epoch": 21.29,
1033
+ "learning_rate": 2.9112903225806453e-05,
1034
+ "loss": 0.1842,
1035
+ "step": 2640
1036
+ },
1037
+ {
1038
+ "epoch": 21.45,
1039
+ "learning_rate": 2.8575268817204302e-05,
1040
+ "loss": 0.2131,
1041
+ "step": 2660
1042
+ },
1043
+ {
1044
+ "epoch": 21.61,
1045
+ "learning_rate": 2.8037634408602155e-05,
1046
+ "loss": 0.2509,
1047
+ "step": 2680
1048
+ },
1049
+ {
1050
+ "epoch": 21.77,
1051
+ "learning_rate": 2.7500000000000004e-05,
1052
+ "loss": 0.2082,
1053
+ "step": 2700
1054
+ },
1055
+ {
1056
+ "epoch": 21.77,
1057
+ "eval_accuracy": 0.8345323741007195,
1058
+ "eval_loss": 0.6389289498329163,
1059
+ "eval_runtime": 12.0715,
1060
+ "eval_samples_per_second": 57.574,
1061
+ "eval_steps_per_second": 7.207,
1062
+ "step": 2700
1063
+ },
1064
+ {
1065
+ "epoch": 21.94,
1066
+ "learning_rate": 2.6962365591397854e-05,
1067
+ "loss": 0.2671,
1068
+ "step": 2720
1069
+ },
1070
+ {
1071
+ "epoch": 22.1,
1072
+ "learning_rate": 2.6424731182795696e-05,
1073
+ "loss": 0.2534,
1074
+ "step": 2740
1075
+ },
1076
+ {
1077
+ "epoch": 22.26,
1078
+ "learning_rate": 2.588709677419355e-05,
1079
+ "loss": 0.205,
1080
+ "step": 2760
1081
+ },
1082
+ {
1083
+ "epoch": 22.42,
1084
+ "learning_rate": 2.5349462365591398e-05,
1085
+ "loss": 0.1904,
1086
+ "step": 2780
1087
+ },
1088
+ {
1089
+ "epoch": 22.58,
1090
+ "learning_rate": 2.4811827956989248e-05,
1091
+ "loss": 0.2751,
1092
+ "step": 2800
1093
+ },
1094
+ {
1095
+ "epoch": 22.58,
1096
+ "eval_accuracy": 0.837410071942446,
1097
+ "eval_loss": 0.6141177415847778,
1098
+ "eval_runtime": 12.0253,
1099
+ "eval_samples_per_second": 57.795,
1100
+ "eval_steps_per_second": 7.235,
1101
+ "step": 2800
1102
+ },
1103
+ {
1104
+ "epoch": 22.74,
1105
+ "learning_rate": 2.4274193548387097e-05,
1106
+ "loss": 0.1863,
1107
+ "step": 2820
1108
+ },
1109
+ {
1110
+ "epoch": 22.9,
1111
+ "learning_rate": 2.3736559139784946e-05,
1112
+ "loss": 0.2385,
1113
+ "step": 2840
1114
+ },
1115
+ {
1116
+ "epoch": 23.06,
1117
+ "learning_rate": 2.31989247311828e-05,
1118
+ "loss": 0.1666,
1119
+ "step": 2860
1120
+ },
1121
+ {
1122
+ "epoch": 23.23,
1123
+ "learning_rate": 2.266129032258065e-05,
1124
+ "loss": 0.2087,
1125
+ "step": 2880
1126
+ },
1127
+ {
1128
+ "epoch": 23.39,
1129
+ "learning_rate": 2.2123655913978494e-05,
1130
+ "loss": 0.207,
1131
+ "step": 2900
1132
+ },
1133
+ {
1134
+ "epoch": 23.39,
1135
+ "eval_accuracy": 0.8258992805755395,
1136
+ "eval_loss": 0.6051694750785828,
1137
+ "eval_runtime": 12.011,
1138
+ "eval_samples_per_second": 57.864,
1139
+ "eval_steps_per_second": 7.243,
1140
+ "step": 2900
1141
+ },
1142
+ {
1143
+ "epoch": 23.55,
1144
+ "learning_rate": 2.1586021505376344e-05,
1145
+ "loss": 0.2026,
1146
+ "step": 2920
1147
+ },
1148
+ {
1149
+ "epoch": 23.71,
1150
+ "learning_rate": 2.1048387096774193e-05,
1151
+ "loss": 0.2073,
1152
+ "step": 2940
1153
+ },
1154
+ {
1155
+ "epoch": 23.87,
1156
+ "learning_rate": 2.0510752688172046e-05,
1157
+ "loss": 0.1829,
1158
+ "step": 2960
1159
+ },
1160
+ {
1161
+ "epoch": 24.03,
1162
+ "learning_rate": 1.9973118279569895e-05,
1163
+ "loss": 0.2048,
1164
+ "step": 2980
1165
+ },
1166
+ {
1167
+ "epoch": 24.19,
1168
+ "learning_rate": 1.9435483870967744e-05,
1169
+ "loss": 0.1791,
1170
+ "step": 3000
1171
+ },
1172
+ {
1173
+ "epoch": 24.19,
1174
+ "eval_accuracy": 0.823021582733813,
1175
+ "eval_loss": 0.6331909894943237,
1176
+ "eval_runtime": 12.2518,
1177
+ "eval_samples_per_second": 56.726,
1178
+ "eval_steps_per_second": 7.101,
1179
+ "step": 3000
1180
+ },
1181
+ {
1182
+ "epoch": 24.35,
1183
+ "learning_rate": 1.889784946236559e-05,
1184
+ "loss": 0.2243,
1185
+ "step": 3020
1186
+ },
1187
+ {
1188
+ "epoch": 24.52,
1189
+ "learning_rate": 1.836021505376344e-05,
1190
+ "loss": 0.2148,
1191
+ "step": 3040
1192
+ },
1193
+ {
1194
+ "epoch": 24.68,
1195
+ "learning_rate": 1.7822580645161292e-05,
1196
+ "loss": 0.2568,
1197
+ "step": 3060
1198
+ },
1199
+ {
1200
+ "epoch": 24.84,
1201
+ "learning_rate": 1.728494623655914e-05,
1202
+ "loss": 0.1782,
1203
+ "step": 3080
1204
+ },
1205
+ {
1206
+ "epoch": 25.0,
1207
+ "learning_rate": 1.674731182795699e-05,
1208
+ "loss": 0.1719,
1209
+ "step": 3100
1210
+ },
1211
+ {
1212
+ "epoch": 25.0,
1213
+ "eval_accuracy": 0.8402877697841726,
1214
+ "eval_loss": 0.5942012071609497,
1215
+ "eval_runtime": 12.1124,
1216
+ "eval_samples_per_second": 57.379,
1217
+ "eval_steps_per_second": 7.183,
1218
+ "step": 3100
1219
+ },
1220
+ {
1221
+ "epoch": 25.16,
1222
+ "learning_rate": 1.620967741935484e-05,
1223
+ "loss": 0.1881,
1224
+ "step": 3120
1225
+ },
1226
+ {
1227
+ "epoch": 25.32,
1228
+ "learning_rate": 1.5672043010752686e-05,
1229
+ "loss": 0.2375,
1230
+ "step": 3140
1231
+ },
1232
+ {
1233
+ "epoch": 25.48,
1234
+ "learning_rate": 1.5134408602150537e-05,
1235
+ "loss": 0.1779,
1236
+ "step": 3160
1237
+ },
1238
+ {
1239
+ "epoch": 25.65,
1240
+ "learning_rate": 1.4596774193548388e-05,
1241
+ "loss": 0.1753,
1242
+ "step": 3180
1243
+ },
1244
+ {
1245
+ "epoch": 25.81,
1246
+ "learning_rate": 1.4059139784946238e-05,
1247
+ "loss": 0.1685,
1248
+ "step": 3200
1249
+ },
1250
+ {
1251
+ "epoch": 25.81,
1252
+ "eval_accuracy": 0.8359712230215828,
1253
+ "eval_loss": 0.612082839012146,
1254
+ "eval_runtime": 12.1399,
1255
+ "eval_samples_per_second": 57.249,
1256
+ "eval_steps_per_second": 7.166,
1257
+ "step": 3200
1258
+ },
1259
+ {
1260
+ "epoch": 25.97,
1261
+ "learning_rate": 1.3521505376344087e-05,
1262
+ "loss": 0.1407,
1263
+ "step": 3220
1264
+ },
1265
+ {
1266
+ "epoch": 26.13,
1267
+ "learning_rate": 1.2983870967741938e-05,
1268
+ "loss": 0.1854,
1269
+ "step": 3240
1270
+ },
1271
+ {
1272
+ "epoch": 26.29,
1273
+ "learning_rate": 1.2446236559139786e-05,
1274
+ "loss": 0.1956,
1275
+ "step": 3260
1276
+ },
1277
+ {
1278
+ "epoch": 26.45,
1279
+ "learning_rate": 1.1908602150537635e-05,
1280
+ "loss": 0.2,
1281
+ "step": 3280
1282
+ },
1283
+ {
1284
+ "epoch": 26.61,
1285
+ "learning_rate": 1.1370967741935484e-05,
1286
+ "loss": 0.1557,
1287
+ "step": 3300
1288
+ },
1289
+ {
1290
+ "epoch": 26.61,
1291
+ "eval_accuracy": 0.8345323741007195,
1292
+ "eval_loss": 0.6236761212348938,
1293
+ "eval_runtime": 11.9687,
1294
+ "eval_samples_per_second": 58.068,
1295
+ "eval_steps_per_second": 7.269,
1296
+ "step": 3300
1297
+ },
1298
+ {
1299
+ "epoch": 26.77,
1300
+ "learning_rate": 1.0833333333333334e-05,
1301
+ "loss": 0.212,
1302
+ "step": 3320
1303
+ },
1304
+ {
1305
+ "epoch": 26.94,
1306
+ "learning_rate": 1.0295698924731183e-05,
1307
+ "loss": 0.1893,
1308
+ "step": 3340
1309
+ },
1310
+ {
1311
+ "epoch": 27.1,
1312
+ "learning_rate": 9.758064516129032e-06,
1313
+ "loss": 0.159,
1314
+ "step": 3360
1315
+ },
1316
+ {
1317
+ "epoch": 27.26,
1318
+ "learning_rate": 9.220430107526883e-06,
1319
+ "loss": 0.1755,
1320
+ "step": 3380
1321
+ },
1322
+ {
1323
+ "epoch": 27.42,
1324
+ "learning_rate": 8.68279569892473e-06,
1325
+ "loss": 0.1694,
1326
+ "step": 3400
1327
+ },
1328
+ {
1329
+ "epoch": 27.42,
1330
+ "eval_accuracy": 0.8316546762589928,
1331
+ "eval_loss": 0.6371967792510986,
1332
+ "eval_runtime": 12.2075,
1333
+ "eval_samples_per_second": 56.932,
1334
+ "eval_steps_per_second": 7.127,
1335
+ "step": 3400
1336
+ },
1337
+ {
1338
+ "epoch": 27.58,
1339
+ "learning_rate": 8.145161290322582e-06,
1340
+ "loss": 0.1552,
1341
+ "step": 3420
1342
+ },
1343
+ {
1344
+ "epoch": 27.74,
1345
+ "learning_rate": 7.607526881720431e-06,
1346
+ "loss": 0.1848,
1347
+ "step": 3440
1348
+ },
1349
+ {
1350
+ "epoch": 27.9,
1351
+ "learning_rate": 7.06989247311828e-06,
1352
+ "loss": 0.1467,
1353
+ "step": 3460
1354
+ },
1355
+ {
1356
+ "epoch": 28.06,
1357
+ "learning_rate": 6.532258064516129e-06,
1358
+ "loss": 0.1292,
1359
+ "step": 3480
1360
+ },
1361
+ {
1362
+ "epoch": 28.23,
1363
+ "learning_rate": 5.994623655913978e-06,
1364
+ "loss": 0.1927,
1365
+ "step": 3500
1366
+ },
1367
+ {
1368
+ "epoch": 28.23,
1369
+ "eval_accuracy": 0.8273381294964028,
1370
+ "eval_loss": 0.6377986073493958,
1371
+ "eval_runtime": 12.0256,
1372
+ "eval_samples_per_second": 57.793,
1373
+ "eval_steps_per_second": 7.235,
1374
+ "step": 3500
1375
+ },
1376
+ {
1377
+ "epoch": 28.39,
1378
+ "learning_rate": 5.4569892473118285e-06,
1379
+ "loss": 0.2078,
1380
+ "step": 3520
1381
+ },
1382
+ {
1383
+ "epoch": 28.55,
1384
+ "learning_rate": 4.919354838709678e-06,
1385
+ "loss": 0.1992,
1386
+ "step": 3540
1387
+ },
1388
+ {
1389
+ "epoch": 28.71,
1390
+ "learning_rate": 4.381720430107527e-06,
1391
+ "loss": 0.1773,
1392
+ "step": 3560
1393
+ },
1394
+ {
1395
+ "epoch": 28.87,
1396
+ "learning_rate": 3.8440860215053765e-06,
1397
+ "loss": 0.1334,
1398
+ "step": 3580
1399
+ },
1400
+ {
1401
+ "epoch": 29.03,
1402
+ "learning_rate": 3.3064516129032262e-06,
1403
+ "loss": 0.1375,
1404
+ "step": 3600
1405
+ },
1406
+ {
1407
+ "epoch": 29.03,
1408
+ "eval_accuracy": 0.8330935251798561,
1409
+ "eval_loss": 0.6257502436637878,
1410
+ "eval_runtime": 12.1079,
1411
+ "eval_samples_per_second": 57.4,
1412
+ "eval_steps_per_second": 7.185,
1413
+ "step": 3600
1414
+ },
1415
+ {
1416
+ "epoch": 29.19,
1417
+ "learning_rate": 2.768817204301075e-06,
1418
+ "loss": 0.1842,
1419
+ "step": 3620
1420
+ },
1421
+ {
1422
+ "epoch": 29.35,
1423
+ "learning_rate": 2.231182795698925e-06,
1424
+ "loss": 0.169,
1425
+ "step": 3640
1426
+ },
1427
+ {
1428
+ "epoch": 29.52,
1429
+ "learning_rate": 1.6935483870967744e-06,
1430
+ "loss": 0.1623,
1431
+ "step": 3660
1432
+ },
1433
+ {
1434
+ "epoch": 29.68,
1435
+ "learning_rate": 1.1559139784946237e-06,
1436
+ "loss": 0.1658,
1437
+ "step": 3680
1438
+ },
1439
+ {
1440
+ "epoch": 29.84,
1441
+ "learning_rate": 6.182795698924732e-07,
1442
+ "loss": 0.1653,
1443
+ "step": 3700
1444
+ },
1445
+ {
1446
+ "epoch": 29.84,
1447
+ "eval_accuracy": 0.8330935251798561,
1448
+ "eval_loss": 0.626188337802887,
1449
+ "eval_runtime": 12.1517,
1450
+ "eval_samples_per_second": 57.193,
1451
+ "eval_steps_per_second": 7.159,
1452
+ "step": 3700
1453
+ },
1454
+ {
1455
+ "epoch": 30.0,
1456
+ "learning_rate": 8.064516129032259e-08,
1457
+ "loss": 0.1634,
1458
+ "step": 3720
1459
+ },
1460
+ {
1461
+ "epoch": 30.0,
1462
+ "step": 3720,
1463
+ "total_flos": 9.155203906807849e+18,
1464
+ "train_loss": 0.5316014153983003,
1465
+ "train_runtime": 4018.8325,
1466
+ "train_samples_per_second": 29.389,
1467
+ "train_steps_per_second": 0.926
1468
  }
1469
  ],
1470
  "logging_steps": 20,
1471
+ "max_steps": 3720,
1472
+ "num_train_epochs": 30,
1473
  "save_steps": 100,
1474
+ "total_flos": 9.155203906807849e+18,
1475
  "trial_name": null,
1476
  "trial_params": null
1477
  }