ljttw commited on
Commit
9bdfd77
·
verified ·
1 Parent(s): 03acad4

Model save

Browse files
Files changed (3) hide show
  1. all_results.json +4 -4
  2. train_results.json +4 -4
  3. trainer_state.json +518 -518
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 2.9987959060806744,
3
  "total_flos": 2.4970885598061527e+19,
4
- "train_loss": 0.046114599592564115,
5
- "train_runtime": 4604.1373,
6
- "train_samples_per_second": 69.254,
7
- "train_steps_per_second": 0.541
8
  }
 
1
  {
2
  "epoch": 2.9987959060806744,
3
  "total_flos": 2.4970885598061527e+19,
4
+ "train_loss": 0.05150821726141782,
5
+ "train_runtime": 5585.3207,
6
+ "train_samples_per_second": 57.088,
7
+ "train_steps_per_second": 0.446
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 2.9987959060806744,
3
  "total_flos": 2.4970885598061527e+19,
4
- "train_loss": 0.046114599592564115,
5
- "train_runtime": 4604.1373,
6
- "train_samples_per_second": 69.254,
7
- "train_steps_per_second": 0.541
8
  }
 
1
  {
2
  "epoch": 2.9987959060806744,
3
  "total_flos": 2.4970885598061527e+19,
4
+ "train_loss": 0.05150821726141782,
5
+ "train_runtime": 5585.3207,
6
+ "train_samples_per_second": 57.088,
7
+ "train_steps_per_second": 0.446
8
  }
trainer_state.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "best_metric": 0.9959466783459585,
3
  "best_model_checkpoint": "swin-base-patch4-window7-224-finetuned-eurosat/checkpoint-2490",
4
  "epoch": 2.9987959060806744,
5
  "eval_steps": 500,
@@ -10,1782 +10,1782 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.012040939193257074,
13
- "grad_norm": 32.612525939941406,
14
  "learning_rate": 2.0080321285140564e-06,
15
- "loss": 1.3843,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.024081878386514148,
20
- "grad_norm": 36.75254440307617,
21
  "learning_rate": 4.016064257028113e-06,
22
- "loss": 1.1816,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.036122817579771226,
27
- "grad_norm": 42.68634033203125,
28
  "learning_rate": 6.024096385542169e-06,
29
- "loss": 0.8859,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.048163756773028296,
34
- "grad_norm": 35.38270568847656,
35
  "learning_rate": 8.032128514056226e-06,
36
- "loss": 0.5628,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.060204695966285374,
41
- "grad_norm": 29.919187545776367,
42
  "learning_rate": 1.0040160642570281e-05,
43
- "loss": 0.3711,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.07224563515954245,
48
- "grad_norm": 40.82660675048828,
49
  "learning_rate": 1.2048192771084338e-05,
50
- "loss": 0.2547,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.08428657435279951,
55
- "grad_norm": 36.59829330444336,
56
  "learning_rate": 1.4056224899598394e-05,
57
- "loss": 0.1965,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.09632751354605659,
62
- "grad_norm": 33.438053131103516,
63
  "learning_rate": 1.606425702811245e-05,
64
- "loss": 0.1394,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.10836845273931367,
69
- "grad_norm": 18.46332359313965,
70
  "learning_rate": 1.8072289156626505e-05,
71
- "loss": 0.1197,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.12040939193257075,
76
- "grad_norm": 33.16898727416992,
77
  "learning_rate": 2.0080321285140562e-05,
78
- "loss": 0.1488,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.13245033112582782,
83
- "grad_norm": 12.674606323242188,
84
  "learning_rate": 2.208835341365462e-05,
85
- "loss": 0.1066,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.1444912703190849,
90
- "grad_norm": 21.0738582611084,
91
  "learning_rate": 2.4096385542168677e-05,
92
- "loss": 0.0717,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 0.15653220951234195,
97
- "grad_norm": 41.47367858886719,
98
  "learning_rate": 2.6104417670682734e-05,
99
- "loss": 0.1077,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 0.16857314870559903,
104
- "grad_norm": 25.500967025756836,
105
  "learning_rate": 2.8112449799196788e-05,
106
- "loss": 0.1064,
107
  "step": 140
108
  },
109
  {
110
  "epoch": 0.1806140878988561,
111
- "grad_norm": 35.477237701416016,
112
  "learning_rate": 3.012048192771085e-05,
113
- "loss": 0.0799,
114
  "step": 150
115
  },
116
  {
117
  "epoch": 0.19265502709211318,
118
- "grad_norm": 14.265376091003418,
119
  "learning_rate": 3.21285140562249e-05,
120
- "loss": 0.0881,
121
  "step": 160
122
  },
123
  {
124
  "epoch": 0.20469596628537026,
125
- "grad_norm": 11.8284912109375,
126
  "learning_rate": 3.413654618473896e-05,
127
- "loss": 0.0879,
128
  "step": 170
129
  },
130
  {
131
  "epoch": 0.21673690547862734,
132
- "grad_norm": 31.987476348876953,
133
  "learning_rate": 3.614457831325301e-05,
134
- "loss": 0.1052,
135
  "step": 180
136
  },
137
  {
138
  "epoch": 0.22877784467188442,
139
- "grad_norm": 15.683510780334473,
140
  "learning_rate": 3.815261044176707e-05,
141
- "loss": 0.0805,
142
  "step": 190
143
  },
144
  {
145
  "epoch": 0.2408187838651415,
146
- "grad_norm": 19.198974609375,
147
  "learning_rate": 4.0160642570281125e-05,
148
- "loss": 0.0735,
149
  "step": 200
150
  },
151
  {
152
  "epoch": 0.25285972305839854,
153
- "grad_norm": 10.244300842285156,
154
  "learning_rate": 4.2168674698795186e-05,
155
- "loss": 0.0908,
156
  "step": 210
157
  },
158
  {
159
  "epoch": 0.26490066225165565,
160
- "grad_norm": 22.921781539916992,
161
  "learning_rate": 4.417670682730924e-05,
162
- "loss": 0.0797,
163
  "step": 220
164
  },
165
  {
166
  "epoch": 0.2769416014449127,
167
- "grad_norm": 22.745960235595703,
168
  "learning_rate": 4.61847389558233e-05,
169
- "loss": 0.0737,
170
  "step": 230
171
  },
172
  {
173
  "epoch": 0.2889825406381698,
174
- "grad_norm": 11.816954612731934,
175
  "learning_rate": 4.8192771084337354e-05,
176
- "loss": 0.0699,
177
  "step": 240
178
  },
179
  {
180
  "epoch": 0.30102347983142685,
181
- "grad_norm": 14.574070930480957,
182
  "learning_rate": 4.9977688531905406e-05,
183
- "loss": 0.0982,
184
  "step": 250
185
  },
186
  {
187
  "epoch": 0.3130644190246839,
188
- "grad_norm": 12.386483192443848,
189
  "learning_rate": 4.97545738509594e-05,
190
- "loss": 0.0698,
191
  "step": 260
192
  },
193
  {
194
  "epoch": 0.325105358217941,
195
- "grad_norm": 9.17690658569336,
196
  "learning_rate": 4.953145917001339e-05,
197
- "loss": 0.0679,
198
  "step": 270
199
  },
200
  {
201
  "epoch": 0.33714629741119806,
202
- "grad_norm": 12.940923690795898,
203
  "learning_rate": 4.930834448906738e-05,
204
- "loss": 0.064,
205
  "step": 280
206
  },
207
  {
208
  "epoch": 0.34918723660445516,
209
- "grad_norm": 5.107398986816406,
210
  "learning_rate": 4.908522980812137e-05,
211
- "loss": 0.0761,
212
  "step": 290
213
  },
214
  {
215
  "epoch": 0.3612281757977122,
216
- "grad_norm": 17.51556396484375,
217
  "learning_rate": 4.886211512717537e-05,
218
- "loss": 0.0725,
219
  "step": 300
220
  },
221
  {
222
  "epoch": 0.3732691149909693,
223
- "grad_norm": 14.238765716552734,
224
  "learning_rate": 4.8639000446229364e-05,
225
- "loss": 0.0655,
226
  "step": 310
227
  },
228
  {
229
  "epoch": 0.38531005418422637,
230
- "grad_norm": 14.03766918182373,
231
  "learning_rate": 4.8415885765283355e-05,
232
- "loss": 0.0678,
233
  "step": 320
234
  },
235
  {
236
  "epoch": 0.3973509933774834,
237
- "grad_norm": 5.484993934631348,
238
  "learning_rate": 4.8192771084337354e-05,
239
- "loss": 0.0663,
240
  "step": 330
241
  },
242
  {
243
  "epoch": 0.4093919325707405,
244
- "grad_norm": 5.191390037536621,
245
  "learning_rate": 4.7969656403391346e-05,
246
- "loss": 0.0673,
247
  "step": 340
248
  },
249
  {
250
  "epoch": 0.4214328717639976,
251
- "grad_norm": 13.674844741821289,
252
  "learning_rate": 4.774654172244534e-05,
253
- "loss": 0.0317,
254
  "step": 350
255
  },
256
  {
257
  "epoch": 0.4334738109572547,
258
- "grad_norm": 10.170462608337402,
259
  "learning_rate": 4.7523427041499336e-05,
260
- "loss": 0.0554,
261
  "step": 360
262
  },
263
  {
264
  "epoch": 0.44551475015051173,
265
- "grad_norm": 14.504225730895996,
266
  "learning_rate": 4.730031236055333e-05,
267
- "loss": 0.045,
268
  "step": 370
269
  },
270
  {
271
  "epoch": 0.45755568934376883,
272
- "grad_norm": 11.081159591674805,
273
  "learning_rate": 4.707719767960732e-05,
274
- "loss": 0.0788,
275
  "step": 380
276
  },
277
  {
278
  "epoch": 0.4695966285370259,
279
- "grad_norm": 8.752190589904785,
280
  "learning_rate": 4.685408299866131e-05,
281
- "loss": 0.0718,
282
  "step": 390
283
  },
284
  {
285
  "epoch": 0.481637567730283,
286
- "grad_norm": 12.14074420928955,
287
  "learning_rate": 4.663096831771531e-05,
288
- "loss": 0.0587,
289
  "step": 400
290
  },
291
  {
292
  "epoch": 0.49367850692354004,
293
- "grad_norm": 16.538904190063477,
294
  "learning_rate": 4.64078536367693e-05,
295
- "loss": 0.0565,
296
  "step": 410
297
  },
298
  {
299
  "epoch": 0.5057194461167971,
300
- "grad_norm": 16.974504470825195,
301
  "learning_rate": 4.61847389558233e-05,
302
- "loss": 0.0389,
303
  "step": 420
304
  },
305
  {
306
  "epoch": 0.5177603853100542,
307
- "grad_norm": 16.62447166442871,
308
  "learning_rate": 4.596162427487729e-05,
309
- "loss": 0.083,
310
  "step": 430
311
  },
312
  {
313
  "epoch": 0.5298013245033113,
314
- "grad_norm": 5.195260047912598,
315
  "learning_rate": 4.5738509593931284e-05,
316
- "loss": 0.037,
317
  "step": 440
318
  },
319
  {
320
  "epoch": 0.5418422636965683,
321
- "grad_norm": 24.313247680664062,
322
  "learning_rate": 4.5515394912985275e-05,
323
- "loss": 0.0818,
324
  "step": 450
325
  },
326
  {
327
  "epoch": 0.5538832028898254,
328
- "grad_norm": 10.289843559265137,
329
  "learning_rate": 4.529228023203927e-05,
330
- "loss": 0.0441,
331
  "step": 460
332
  },
333
  {
334
  "epoch": 0.5659241420830825,
335
- "grad_norm": 10.024016380310059,
336
  "learning_rate": 4.506916555109326e-05,
337
- "loss": 0.0525,
338
  "step": 470
339
  },
340
  {
341
  "epoch": 0.5779650812763396,
342
- "grad_norm": 17.331464767456055,
343
  "learning_rate": 4.484605087014726e-05,
344
- "loss": 0.0597,
345
  "step": 480
346
  },
347
  {
348
  "epoch": 0.5900060204695966,
349
- "grad_norm": 12.607904434204102,
350
  "learning_rate": 4.4622936189201256e-05,
351
- "loss": 0.0565,
352
  "step": 490
353
  },
354
  {
355
  "epoch": 0.6020469596628537,
356
- "grad_norm": 8.530034065246582,
357
  "learning_rate": 4.439982150825525e-05,
358
- "loss": 0.0355,
359
  "step": 500
360
  },
361
  {
362
  "epoch": 0.6140878988561108,
363
- "grad_norm": 14.896183013916016,
364
  "learning_rate": 4.417670682730924e-05,
365
- "loss": 0.0381,
366
  "step": 510
367
  },
368
  {
369
  "epoch": 0.6261288380493678,
370
- "grad_norm": 4.303857326507568,
371
  "learning_rate": 4.395359214636323e-05,
372
- "loss": 0.0504,
373
  "step": 520
374
  },
375
  {
376
  "epoch": 0.6381697772426249,
377
- "grad_norm": 7.028829097747803,
378
  "learning_rate": 4.373047746541722e-05,
379
- "loss": 0.0313,
380
  "step": 530
381
  },
382
  {
383
  "epoch": 0.650210716435882,
384
- "grad_norm": 0.6708688139915466,
385
  "learning_rate": 4.350736278447122e-05,
386
- "loss": 0.0263,
387
  "step": 540
388
  },
389
  {
390
  "epoch": 0.6622516556291391,
391
- "grad_norm": 21.792503356933594,
392
  "learning_rate": 4.328424810352521e-05,
393
- "loss": 0.0337,
394
  "step": 550
395
  },
396
  {
397
  "epoch": 0.6742925948223961,
398
- "grad_norm": 30.61802101135254,
399
  "learning_rate": 4.306113342257921e-05,
400
- "loss": 0.0514,
401
  "step": 560
402
  },
403
  {
404
  "epoch": 0.6863335340156532,
405
- "grad_norm": 15.264066696166992,
406
  "learning_rate": 4.2838018741633203e-05,
407
- "loss": 0.0565,
408
  "step": 570
409
  },
410
  {
411
  "epoch": 0.6983744732089103,
412
- "grad_norm": 9.57032299041748,
413
  "learning_rate": 4.2614904060687195e-05,
414
- "loss": 0.0425,
415
  "step": 580
416
  },
417
  {
418
  "epoch": 0.7104154124021673,
419
- "grad_norm": 16.22507095336914,
420
  "learning_rate": 4.239178937974119e-05,
421
- "loss": 0.0305,
422
  "step": 590
423
  },
424
  {
425
  "epoch": 0.7224563515954244,
426
- "grad_norm": 18.800668716430664,
427
  "learning_rate": 4.2168674698795186e-05,
428
- "loss": 0.0506,
429
  "step": 600
430
  },
431
  {
432
  "epoch": 0.7344972907886815,
433
- "grad_norm": 21.669200897216797,
434
  "learning_rate": 4.194556001784918e-05,
435
- "loss": 0.0413,
436
  "step": 610
437
  },
438
  {
439
  "epoch": 0.7465382299819386,
440
- "grad_norm": 19.009016036987305,
441
  "learning_rate": 4.172244533690317e-05,
442
- "loss": 0.0296,
443
  "step": 620
444
  },
445
  {
446
  "epoch": 0.7585791691751956,
447
- "grad_norm": 1.3789639472961426,
448
  "learning_rate": 4.149933065595716e-05,
449
- "loss": 0.0188,
450
  "step": 630
451
  },
452
  {
453
  "epoch": 0.7706201083684527,
454
- "grad_norm": 18.90018081665039,
455
  "learning_rate": 4.127621597501116e-05,
456
- "loss": 0.0424,
457
  "step": 640
458
  },
459
  {
460
  "epoch": 0.7826610475617098,
461
- "grad_norm": 0.7864706516265869,
462
  "learning_rate": 4.105310129406515e-05,
463
- "loss": 0.0379,
464
  "step": 650
465
  },
466
  {
467
  "epoch": 0.7947019867549668,
468
- "grad_norm": 14.94273567199707,
469
  "learning_rate": 4.082998661311915e-05,
470
- "loss": 0.0616,
471
  "step": 660
472
  },
473
  {
474
  "epoch": 0.8067429259482239,
475
- "grad_norm": 12.019325256347656,
476
  "learning_rate": 4.060687193217314e-05,
477
- "loss": 0.0393,
478
  "step": 670
479
  },
480
  {
481
  "epoch": 0.818783865141481,
482
- "grad_norm": 14.43171215057373,
483
  "learning_rate": 4.038375725122713e-05,
484
- "loss": 0.0341,
485
  "step": 680
486
  },
487
  {
488
  "epoch": 0.8308248043347382,
489
- "grad_norm": 1.1099140644073486,
490
  "learning_rate": 4.0160642570281125e-05,
491
- "loss": 0.0314,
492
  "step": 690
493
  },
494
  {
495
  "epoch": 0.8428657435279951,
496
- "grad_norm": 11.7412109375,
497
  "learning_rate": 3.993752788933512e-05,
498
- "loss": 0.0278,
499
  "step": 700
500
  },
501
  {
502
  "epoch": 0.8549066827212523,
503
- "grad_norm": 14.483551025390625,
504
  "learning_rate": 3.9714413208389115e-05,
505
- "loss": 0.0536,
506
  "step": 710
507
  },
508
  {
509
  "epoch": 0.8669476219145094,
510
- "grad_norm": 19.979427337646484,
511
  "learning_rate": 3.949129852744311e-05,
512
- "loss": 0.0403,
513
  "step": 720
514
  },
515
  {
516
  "epoch": 0.8789885611077664,
517
- "grad_norm": 7.5272040367126465,
518
  "learning_rate": 3.9268183846497105e-05,
519
- "loss": 0.0272,
520
  "step": 730
521
  },
522
  {
523
  "epoch": 0.8910295003010235,
524
- "grad_norm": 9.31237506866455,
525
  "learning_rate": 3.90450691655511e-05,
526
- "loss": 0.0269,
527
  "step": 740
528
  },
529
  {
530
  "epoch": 0.9030704394942806,
531
- "grad_norm": 32.383060455322266,
532
  "learning_rate": 3.882195448460509e-05,
533
- "loss": 0.0346,
534
  "step": 750
535
  },
536
  {
537
  "epoch": 0.9151113786875377,
538
- "grad_norm": 15.195170402526855,
539
  "learning_rate": 3.859883980365908e-05,
540
- "loss": 0.0501,
541
  "step": 760
542
  },
543
  {
544
  "epoch": 0.9271523178807947,
545
- "grad_norm": 12.330805778503418,
546
  "learning_rate": 3.837572512271307e-05,
547
- "loss": 0.0345,
548
  "step": 770
549
  },
550
  {
551
  "epoch": 0.9391932570740518,
552
- "grad_norm": 5.9623494148254395,
553
  "learning_rate": 3.815261044176707e-05,
554
- "loss": 0.0257,
555
  "step": 780
556
  },
557
  {
558
  "epoch": 0.9512341962673089,
559
- "grad_norm": 7.9370503425598145,
560
  "learning_rate": 3.792949576082106e-05,
561
- "loss": 0.0327,
562
  "step": 790
563
  },
564
  {
565
  "epoch": 0.963275135460566,
566
- "grad_norm": 4.536835193634033,
567
  "learning_rate": 3.770638107987506e-05,
568
- "loss": 0.022,
569
  "step": 800
570
  },
571
  {
572
  "epoch": 0.975316074653823,
573
- "grad_norm": 36.62160110473633,
574
  "learning_rate": 3.748326639892905e-05,
575
- "loss": 0.0331,
576
  "step": 810
577
  },
578
  {
579
  "epoch": 0.9873570138470801,
580
- "grad_norm": 34.91263961791992,
581
  "learning_rate": 3.7260151717983045e-05,
582
- "loss": 0.0538,
583
  "step": 820
584
  },
585
  {
586
  "epoch": 0.9993979530403372,
587
- "grad_norm": 33.354026794433594,
588
  "learning_rate": 3.7037037037037037e-05,
589
- "loss": 0.0421,
590
  "step": 830
591
  },
592
  {
593
  "epoch": 0.9993979530403372,
594
- "eval_f1": 0.9880010179847418,
595
- "eval_loss": 0.018475642427802086,
596
- "eval_runtime": 90.9858,
597
- "eval_samples_per_second": 129.8,
598
- "eval_steps_per_second": 4.067,
599
  "step": 830
600
  },
601
  {
602
  "epoch": 1.0117399157134257,
603
- "grad_norm": 2.2255642414093018,
604
  "learning_rate": 3.6813922356091035e-05,
605
- "loss": 0.0266,
606
  "step": 840
607
  },
608
  {
609
  "epoch": 1.0237808549066827,
610
- "grad_norm": 3.464303493499756,
611
  "learning_rate": 3.659080767514503e-05,
612
- "loss": 0.0172,
613
  "step": 850
614
  },
615
  {
616
  "epoch": 1.03582179409994,
617
- "grad_norm": 24.46825408935547,
618
  "learning_rate": 3.636769299419902e-05,
619
- "loss": 0.0336,
620
  "step": 860
621
  },
622
  {
623
  "epoch": 1.047862733293197,
624
- "grad_norm": 12.748779296875,
625
  "learning_rate": 3.614457831325301e-05,
626
- "loss": 0.0223,
627
  "step": 870
628
  },
629
  {
630
  "epoch": 1.059903672486454,
631
- "grad_norm": 12.276277542114258,
632
  "learning_rate": 3.592146363230701e-05,
633
- "loss": 0.0159,
634
  "step": 880
635
  },
636
  {
637
  "epoch": 1.0719446116797111,
638
- "grad_norm": 0.5621600151062012,
639
  "learning_rate": 3.5698348951361e-05,
640
- "loss": 0.0124,
641
  "step": 890
642
  },
643
  {
644
  "epoch": 1.083985550872968,
645
- "grad_norm": 13.014676094055176,
646
  "learning_rate": 3.5475234270415e-05,
647
- "loss": 0.0312,
648
  "step": 900
649
  },
650
  {
651
  "epoch": 1.096026490066225,
652
- "grad_norm": 10.154387474060059,
653
  "learning_rate": 3.525211958946899e-05,
654
- "loss": 0.0209,
655
  "step": 910
656
  },
657
  {
658
  "epoch": 1.1080674292594823,
659
- "grad_norm": 9.21349048614502,
660
  "learning_rate": 3.502900490852298e-05,
661
- "loss": 0.0232,
662
  "step": 920
663
  },
664
  {
665
  "epoch": 1.1201083684527393,
666
- "grad_norm": 18.21417808532715,
667
  "learning_rate": 3.4805890227576974e-05,
668
- "loss": 0.0197,
669
  "step": 930
670
  },
671
  {
672
  "epoch": 1.1321493076459963,
673
- "grad_norm": 4.2937846183776855,
674
  "learning_rate": 3.4582775546630966e-05,
675
- "loss": 0.0177,
676
  "step": 940
677
  },
678
  {
679
  "epoch": 1.1441902468392535,
680
- "grad_norm": 4.605610370635986,
681
  "learning_rate": 3.4359660865684965e-05,
682
- "loss": 0.0198,
683
  "step": 950
684
  },
685
  {
686
  "epoch": 1.1562311860325105,
687
- "grad_norm": 51.68130111694336,
688
  "learning_rate": 3.413654618473896e-05,
689
- "loss": 0.0257,
690
  "step": 960
691
  },
692
  {
693
  "epoch": 1.1682721252257675,
694
- "grad_norm": 27.81155014038086,
695
  "learning_rate": 3.3913431503792955e-05,
696
- "loss": 0.032,
697
  "step": 970
698
  },
699
  {
700
  "epoch": 1.1803130644190247,
701
- "grad_norm": 26.49490737915039,
702
  "learning_rate": 3.369031682284695e-05,
703
- "loss": 0.0372,
704
  "step": 980
705
  },
706
  {
707
  "epoch": 1.1923540036122817,
708
- "grad_norm": 32.12468719482422,
709
  "learning_rate": 3.346720214190094e-05,
710
- "loss": 0.0241,
711
  "step": 990
712
  },
713
  {
714
  "epoch": 1.2043949428055387,
715
- "grad_norm": 22.349802017211914,
716
  "learning_rate": 3.324408746095493e-05,
717
- "loss": 0.0429,
718
  "step": 1000
719
  },
720
  {
721
  "epoch": 1.216435881998796,
722
- "grad_norm": 12.724302291870117,
723
  "learning_rate": 3.302097278000892e-05,
724
- "loss": 0.0182,
725
  "step": 1010
726
  },
727
  {
728
  "epoch": 1.228476821192053,
729
- "grad_norm": 1.7184627056121826,
730
  "learning_rate": 3.279785809906292e-05,
731
- "loss": 0.0157,
732
  "step": 1020
733
  },
734
  {
735
  "epoch": 1.2405177603853101,
736
- "grad_norm": 5.114785194396973,
737
  "learning_rate": 3.257474341811691e-05,
738
- "loss": 0.0317,
739
  "step": 1030
740
  },
741
  {
742
  "epoch": 1.2525586995785671,
743
- "grad_norm": 12.009075164794922,
744
  "learning_rate": 3.235162873717091e-05,
745
- "loss": 0.0185,
746
  "step": 1040
747
  },
748
  {
749
  "epoch": 1.2645996387718241,
750
- "grad_norm": 1.8376182317733765,
751
  "learning_rate": 3.21285140562249e-05,
752
- "loss": 0.0268,
753
  "step": 1050
754
  },
755
  {
756
  "epoch": 1.2766405779650813,
757
- "grad_norm": 22.057708740234375,
758
  "learning_rate": 3.1905399375278894e-05,
759
- "loss": 0.0259,
760
  "step": 1060
761
  },
762
  {
763
  "epoch": 1.2886815171583383,
764
- "grad_norm": 20.427913665771484,
765
  "learning_rate": 3.1682284694332886e-05,
766
- "loss": 0.0177,
767
  "step": 1070
768
  },
769
  {
770
  "epoch": 1.3007224563515956,
771
- "grad_norm": 9.291601181030273,
772
  "learning_rate": 3.1459170013386885e-05,
773
- "loss": 0.0212,
774
  "step": 1080
775
  },
776
  {
777
  "epoch": 1.3127633955448526,
778
- "grad_norm": 5.983607769012451,
779
  "learning_rate": 3.1236055332440876e-05,
780
- "loss": 0.0193,
781
  "step": 1090
782
  },
783
  {
784
  "epoch": 1.3248043347381095,
785
- "grad_norm": 6.0939249992370605,
786
  "learning_rate": 3.101294065149487e-05,
787
- "loss": 0.0331,
788
  "step": 1100
789
  },
790
  {
791
  "epoch": 1.3368452739313668,
792
- "grad_norm": 16.53692054748535,
793
  "learning_rate": 3.078982597054887e-05,
794
- "loss": 0.0111,
795
  "step": 1110
796
  },
797
  {
798
  "epoch": 1.3488862131246238,
799
- "grad_norm": 0.17522521317005157,
800
  "learning_rate": 3.056671128960286e-05,
801
- "loss": 0.0194,
802
  "step": 1120
803
  },
804
  {
805
  "epoch": 1.3609271523178808,
806
- "grad_norm": 18.992000579833984,
807
  "learning_rate": 3.034359660865685e-05,
808
- "loss": 0.0216,
809
  "step": 1130
810
  },
811
  {
812
  "epoch": 1.372968091511138,
813
- "grad_norm": 0.417311429977417,
814
  "learning_rate": 3.012048192771085e-05,
815
- "loss": 0.0146,
816
  "step": 1140
817
  },
818
  {
819
  "epoch": 1.385009030704395,
820
- "grad_norm": 1.3428151607513428,
821
  "learning_rate": 2.989736724676484e-05,
822
- "loss": 0.0147,
823
  "step": 1150
824
  },
825
  {
826
  "epoch": 1.397049969897652,
827
- "grad_norm": 0.13748735189437866,
828
  "learning_rate": 2.9674252565818832e-05,
829
- "loss": 0.0177,
830
  "step": 1160
831
  },
832
  {
833
  "epoch": 1.4090909090909092,
834
- "grad_norm": 0.030834615230560303,
835
  "learning_rate": 2.9451137884872827e-05,
836
- "loss": 0.0164,
837
  "step": 1170
838
  },
839
  {
840
  "epoch": 1.4211318482841662,
841
- "grad_norm": 0.3782236576080322,
842
  "learning_rate": 2.922802320392682e-05,
843
- "loss": 0.0208,
844
  "step": 1180
845
  },
846
  {
847
  "epoch": 1.4331727874774232,
848
- "grad_norm": 0.44036567211151123,
849
  "learning_rate": 2.900490852298081e-05,
850
- "loss": 0.0251,
851
  "step": 1190
852
  },
853
  {
854
  "epoch": 1.4452137266706804,
855
- "grad_norm": 0.4725724458694458,
856
  "learning_rate": 2.878179384203481e-05,
857
- "loss": 0.0192,
858
  "step": 1200
859
  },
860
  {
861
  "epoch": 1.4572546658639374,
862
- "grad_norm": 5.114048480987549,
863
  "learning_rate": 2.85586791610888e-05,
864
- "loss": 0.0132,
865
  "step": 1210
866
  },
867
  {
868
  "epoch": 1.4692956050571944,
869
- "grad_norm": 17.11988067626953,
870
  "learning_rate": 2.8335564480142796e-05,
871
- "loss": 0.0159,
872
  "step": 1220
873
  },
874
  {
875
  "epoch": 1.4813365442504516,
876
- "grad_norm": 25.324487686157227,
877
  "learning_rate": 2.8112449799196788e-05,
878
- "loss": 0.0319,
879
  "step": 1230
880
  },
881
  {
882
  "epoch": 1.4933774834437086,
883
- "grad_norm": 2.2991812229156494,
884
  "learning_rate": 2.788933511825078e-05,
885
- "loss": 0.0088,
886
  "step": 1240
887
  },
888
  {
889
  "epoch": 1.5054184226369656,
890
- "grad_norm": 20.014575958251953,
891
  "learning_rate": 2.7666220437304775e-05,
892
- "loss": 0.0212,
893
  "step": 1250
894
  },
895
  {
896
  "epoch": 1.5174593618302228,
897
- "grad_norm": 5.483692169189453,
898
  "learning_rate": 2.7443105756358774e-05,
899
- "loss": 0.0083,
900
  "step": 1260
901
  },
902
  {
903
  "epoch": 1.5295003010234798,
904
- "grad_norm": 0.06430387496948242,
905
  "learning_rate": 2.7219991075412765e-05,
906
- "loss": 0.0367,
907
  "step": 1270
908
  },
909
  {
910
  "epoch": 1.5415412402167368,
911
- "grad_norm": 0.13903234899044037,
912
  "learning_rate": 2.6996876394466757e-05,
913
- "loss": 0.0103,
914
  "step": 1280
915
  },
916
  {
917
  "epoch": 1.553582179409994,
918
- "grad_norm": 0.16191639006137848,
919
  "learning_rate": 2.6773761713520752e-05,
920
- "loss": 0.0217,
921
  "step": 1290
922
  },
923
  {
924
  "epoch": 1.5656231186032512,
925
- "grad_norm": 29.6510066986084,
926
  "learning_rate": 2.6550647032574744e-05,
927
- "loss": 0.017,
928
  "step": 1300
929
  },
930
  {
931
  "epoch": 1.577664057796508,
932
- "grad_norm": 42.304893493652344,
933
  "learning_rate": 2.6327532351628736e-05,
934
- "loss": 0.0103,
935
  "step": 1310
936
  },
937
  {
938
  "epoch": 1.5897049969897652,
939
- "grad_norm": 15.645540237426758,
940
  "learning_rate": 2.6104417670682734e-05,
941
- "loss": 0.0111,
942
  "step": 1320
943
  },
944
  {
945
  "epoch": 1.6017459361830224,
946
- "grad_norm": 13.845064163208008,
947
  "learning_rate": 2.5881302989736726e-05,
948
- "loss": 0.0206,
949
  "step": 1330
950
  },
951
  {
952
  "epoch": 1.6137868753762792,
953
- "grad_norm": 14.391059875488281,
954
  "learning_rate": 2.565818830879072e-05,
955
- "loss": 0.016,
956
  "step": 1340
957
  },
958
  {
959
  "epoch": 1.6258278145695364,
960
- "grad_norm": 25.877519607543945,
961
  "learning_rate": 2.5435073627844713e-05,
962
- "loss": 0.028,
963
  "step": 1350
964
  },
965
  {
966
  "epoch": 1.6378687537627936,
967
- "grad_norm": 24.07805061340332,
968
  "learning_rate": 2.5211958946898705e-05,
969
- "loss": 0.0193,
970
  "step": 1360
971
  },
972
  {
973
  "epoch": 1.6499096929560506,
974
- "grad_norm": 14.133194923400879,
975
  "learning_rate": 2.4988844265952703e-05,
976
- "loss": 0.0199,
977
  "step": 1370
978
  },
979
  {
980
  "epoch": 1.6619506321493076,
981
- "grad_norm": 12.122308731079102,
982
  "learning_rate": 2.4765729585006695e-05,
983
- "loss": 0.0149,
984
  "step": 1380
985
  },
986
  {
987
  "epoch": 1.6739915713425648,
988
- "grad_norm": 0.21004052460193634,
989
  "learning_rate": 2.4542614904060687e-05,
990
- "loss": 0.0188,
991
  "step": 1390
992
  },
993
  {
994
  "epoch": 1.6860325105358218,
995
- "grad_norm": 33.458866119384766,
996
  "learning_rate": 2.4319500223114682e-05,
997
- "loss": 0.0246,
998
  "step": 1400
999
  },
1000
  {
1001
  "epoch": 1.6980734497290788,
1002
- "grad_norm": 7.240118503570557,
1003
  "learning_rate": 2.4096385542168677e-05,
1004
- "loss": 0.0235,
1005
  "step": 1410
1006
  },
1007
  {
1008
  "epoch": 1.710114388922336,
1009
- "grad_norm": 24.10188102722168,
1010
  "learning_rate": 2.387327086122267e-05,
1011
- "loss": 0.0285,
1012
  "step": 1420
1013
  },
1014
  {
1015
  "epoch": 1.722155328115593,
1016
- "grad_norm": 11.55083179473877,
1017
  "learning_rate": 2.3650156180276664e-05,
1018
- "loss": 0.0456,
1019
  "step": 1430
1020
  },
1021
  {
1022
  "epoch": 1.73419626730885,
1023
- "grad_norm": 18.611114501953125,
1024
  "learning_rate": 2.3427041499330656e-05,
1025
- "loss": 0.0177,
1026
  "step": 1440
1027
  },
1028
  {
1029
  "epoch": 1.7462372065021072,
1030
- "grad_norm": 1.5293560028076172,
1031
  "learning_rate": 2.320392681838465e-05,
1032
- "loss": 0.0364,
1033
  "step": 1450
1034
  },
1035
  {
1036
  "epoch": 1.7582781456953642,
1037
- "grad_norm": 10.18093490600586,
1038
  "learning_rate": 2.2980812137438646e-05,
1039
- "loss": 0.0188,
1040
  "step": 1460
1041
  },
1042
  {
1043
  "epoch": 1.7703190848886212,
1044
- "grad_norm": 14.831791877746582,
1045
  "learning_rate": 2.2757697456492638e-05,
1046
- "loss": 0.0197,
1047
  "step": 1470
1048
  },
1049
  {
1050
  "epoch": 1.7823600240818784,
1051
- "grad_norm": 2.1004889011383057,
1052
  "learning_rate": 2.253458277554663e-05,
1053
- "loss": 0.0127,
1054
  "step": 1480
1055
  },
1056
  {
1057
  "epoch": 1.7944009632751354,
1058
- "grad_norm": 0.04246840998530388,
1059
  "learning_rate": 2.2311468094600628e-05,
1060
- "loss": 0.0081,
1061
  "step": 1490
1062
  },
1063
  {
1064
  "epoch": 1.8064419024683924,
1065
- "grad_norm": 0.033497102558612823,
1066
  "learning_rate": 2.208835341365462e-05,
1067
- "loss": 0.0236,
1068
  "step": 1500
1069
  },
1070
  {
1071
  "epoch": 1.8184828416616496,
1072
- "grad_norm": 19.713680267333984,
1073
  "learning_rate": 2.186523873270861e-05,
1074
- "loss": 0.0205,
1075
  "step": 1510
1076
  },
1077
  {
1078
  "epoch": 1.8305237808549066,
1079
- "grad_norm": 32.84787368774414,
1080
  "learning_rate": 2.1642124051762607e-05,
1081
- "loss": 0.0187,
1082
  "step": 1520
1083
  },
1084
  {
1085
  "epoch": 1.8425647200481636,
1086
- "grad_norm": 8.59056282043457,
1087
  "learning_rate": 2.1419009370816602e-05,
1088
- "loss": 0.0176,
1089
  "step": 1530
1090
  },
1091
  {
1092
  "epoch": 1.8546056592414208,
1093
- "grad_norm": 14.714003562927246,
1094
  "learning_rate": 2.1195894689870593e-05,
1095
- "loss": 0.0187,
1096
  "step": 1540
1097
  },
1098
  {
1099
  "epoch": 1.866646598434678,
1100
- "grad_norm": 12.329898834228516,
1101
  "learning_rate": 2.097278000892459e-05,
1102
- "loss": 0.0078,
1103
  "step": 1550
1104
  },
1105
  {
1106
  "epoch": 1.8786875376279348,
1107
- "grad_norm": 10.909245491027832,
1108
  "learning_rate": 2.074966532797858e-05,
1109
- "loss": 0.0116,
1110
  "step": 1560
1111
  },
1112
  {
1113
  "epoch": 1.890728476821192,
1114
- "grad_norm": 3.1681225299835205,
1115
  "learning_rate": 2.0526550647032576e-05,
1116
- "loss": 0.0105,
1117
  "step": 1570
1118
  },
1119
  {
1120
  "epoch": 1.9027694160144493,
1121
- "grad_norm": 40.327972412109375,
1122
  "learning_rate": 2.030343596608657e-05,
1123
- "loss": 0.0302,
1124
  "step": 1580
1125
  },
1126
  {
1127
  "epoch": 1.914810355207706,
1128
- "grad_norm": 0.34792542457580566,
1129
  "learning_rate": 2.0080321285140562e-05,
1130
- "loss": 0.0054,
1131
  "step": 1590
1132
  },
1133
  {
1134
  "epoch": 1.9268512944009633,
1135
- "grad_norm": 15.200536727905273,
1136
  "learning_rate": 1.9857206604194558e-05,
1137
- "loss": 0.0133,
1138
  "step": 1600
1139
  },
1140
  {
1141
  "epoch": 1.9388922335942205,
1142
- "grad_norm": 21.62051773071289,
1143
  "learning_rate": 1.9634091923248553e-05,
1144
- "loss": 0.0116,
1145
  "step": 1610
1146
  },
1147
  {
1148
  "epoch": 1.9509331727874775,
1149
- "grad_norm": 69.98641204833984,
1150
  "learning_rate": 1.9410977242302544e-05,
1151
- "loss": 0.0112,
1152
  "step": 1620
1153
  },
1154
  {
1155
  "epoch": 1.9629741119807345,
1156
- "grad_norm": 4.132745742797852,
1157
  "learning_rate": 1.9187862561356536e-05,
1158
- "loss": 0.0239,
1159
  "step": 1630
1160
  },
1161
  {
1162
  "epoch": 1.9750150511739917,
1163
- "grad_norm": 37.902156829833984,
1164
  "learning_rate": 1.896474788041053e-05,
1165
- "loss": 0.0072,
1166
  "step": 1640
1167
  },
1168
  {
1169
  "epoch": 1.9870559903672487,
1170
- "grad_norm": 17.657148361206055,
1171
  "learning_rate": 1.8741633199464527e-05,
1172
- "loss": 0.0084,
1173
  "step": 1650
1174
  },
1175
  {
1176
  "epoch": 1.9990969295605057,
1177
- "grad_norm": 4.640130043029785,
1178
  "learning_rate": 1.8518518518518518e-05,
1179
- "loss": 0.0068,
1180
  "step": 1660
1181
  },
1182
  {
1183
  "epoch": 1.9990969295605057,
1184
- "eval_f1": 0.9919412107164509,
1185
- "eval_loss": 0.014574301429092884,
1186
- "eval_runtime": 90.4829,
1187
- "eval_samples_per_second": 130.522,
1188
- "eval_steps_per_second": 4.089,
1189
  "step": 1660
1190
  },
1191
  {
1192
  "epoch": 2.011438892233594,
1193
- "grad_norm": 0.18395759165287018,
1194
  "learning_rate": 1.8295403837572513e-05,
1195
- "loss": 0.0123,
1196
  "step": 1670
1197
  },
1198
  {
1199
  "epoch": 2.0234798314268514,
1200
- "grad_norm": 1.8568618297576904,
1201
  "learning_rate": 1.8072289156626505e-05,
1202
- "loss": 0.0132,
1203
  "step": 1680
1204
  },
1205
  {
1206
  "epoch": 2.035520770620108,
1207
- "grad_norm": 24.79265785217285,
1208
  "learning_rate": 1.78491744756805e-05,
1209
- "loss": 0.0125,
1210
  "step": 1690
1211
  },
1212
  {
1213
  "epoch": 2.0475617098133654,
1214
- "grad_norm": 27.074316024780273,
1215
  "learning_rate": 1.7626059794734495e-05,
1216
- "loss": 0.0087,
1217
  "step": 1700
1218
  },
1219
  {
1220
  "epoch": 2.0596026490066226,
1221
- "grad_norm": 72.42098999023438,
1222
  "learning_rate": 1.7402945113788487e-05,
1223
- "loss": 0.0219,
1224
  "step": 1710
1225
  },
1226
  {
1227
  "epoch": 2.07164358819988,
1228
- "grad_norm": 0.01518308836966753,
1229
  "learning_rate": 1.7179830432842482e-05,
1230
- "loss": 0.0159,
1231
  "step": 1720
1232
  },
1233
  {
1234
  "epoch": 2.0836845273931366,
1235
- "grad_norm": 0.050431251525878906,
1236
  "learning_rate": 1.6956715751896478e-05,
1237
- "loss": 0.0115,
1238
  "step": 1730
1239
  },
1240
  {
1241
  "epoch": 2.095725466586394,
1242
- "grad_norm": 16.709060668945312,
1243
  "learning_rate": 1.673360107095047e-05,
1244
- "loss": 0.0135,
1245
  "step": 1740
1246
  },
1247
  {
1248
  "epoch": 2.107766405779651,
1249
- "grad_norm": 0.12416893243789673,
1250
  "learning_rate": 1.651048639000446e-05,
1251
- "loss": 0.0047,
1252
  "step": 1750
1253
  },
1254
  {
1255
  "epoch": 2.119807344972908,
1256
- "grad_norm": 0.011000348255038261,
1257
  "learning_rate": 1.6287371709058456e-05,
1258
- "loss": 0.0034,
1259
  "step": 1760
1260
  },
1261
  {
1262
  "epoch": 2.131848284166165,
1263
- "grad_norm": 20.947795867919922,
1264
  "learning_rate": 1.606425702811245e-05,
1265
- "loss": 0.0209,
1266
  "step": 1770
1267
  },
1268
  {
1269
  "epoch": 2.1438892233594222,
1270
- "grad_norm": 33.35802459716797,
1271
  "learning_rate": 1.5841142347166443e-05,
1272
- "loss": 0.0031,
1273
  "step": 1780
1274
  },
1275
  {
1276
  "epoch": 2.155930162552679,
1277
- "grad_norm": 2.035996913909912,
1278
  "learning_rate": 1.5618027666220438e-05,
1279
- "loss": 0.0142,
1280
  "step": 1790
1281
  },
1282
  {
1283
  "epoch": 2.167971101745936,
1284
- "grad_norm": 0.2591819167137146,
1285
  "learning_rate": 1.5394912985274433e-05,
1286
- "loss": 0.0065,
1287
  "step": 1800
1288
  },
1289
  {
1290
  "epoch": 2.1800120409391934,
1291
- "grad_norm": 0.16437557339668274,
1292
  "learning_rate": 1.5171798304328425e-05,
1293
- "loss": 0.0074,
1294
  "step": 1810
1295
  },
1296
  {
1297
  "epoch": 2.19205298013245,
1298
- "grad_norm": 0.00624335091561079,
1299
  "learning_rate": 1.494868362338242e-05,
1300
- "loss": 0.0094,
1301
  "step": 1820
1302
  },
1303
  {
1304
  "epoch": 2.2040939193257074,
1305
- "grad_norm": 4.014737606048584,
1306
  "learning_rate": 1.4725568942436414e-05,
1307
- "loss": 0.0004,
1308
  "step": 1830
1309
  },
1310
  {
1311
  "epoch": 2.2161348585189646,
1312
- "grad_norm": 18.55063819885254,
1313
  "learning_rate": 1.4502454261490405e-05,
1314
- "loss": 0.0131,
1315
  "step": 1840
1316
  },
1317
  {
1318
  "epoch": 2.2281757977122214,
1319
- "grad_norm": 6.684368133544922,
1320
  "learning_rate": 1.42793395805444e-05,
1321
- "loss": 0.0055,
1322
  "step": 1850
1323
  },
1324
  {
1325
  "epoch": 2.2402167369054786,
1326
- "grad_norm": 47.78480529785156,
1327
  "learning_rate": 1.4056224899598394e-05,
1328
- "loss": 0.0064,
1329
  "step": 1860
1330
  },
1331
  {
1332
  "epoch": 2.252257676098736,
1333
- "grad_norm": 4.028109073638916,
1334
  "learning_rate": 1.3833110218652387e-05,
1335
- "loss": 0.0011,
1336
  "step": 1870
1337
  },
1338
  {
1339
  "epoch": 2.2642986152919926,
1340
- "grad_norm": 37.70731735229492,
1341
  "learning_rate": 1.3609995537706383e-05,
1342
- "loss": 0.012,
1343
  "step": 1880
1344
  },
1345
  {
1346
  "epoch": 2.27633955448525,
1347
- "grad_norm": 17.22875213623047,
1348
  "learning_rate": 1.3386880856760376e-05,
1349
- "loss": 0.0039,
1350
  "step": 1890
1351
  },
1352
  {
1353
  "epoch": 2.288380493678507,
1354
- "grad_norm": 0.056806787848472595,
1355
  "learning_rate": 1.3163766175814368e-05,
1356
- "loss": 0.0086,
1357
  "step": 1900
1358
  },
1359
  {
1360
  "epoch": 2.300421432871764,
1361
- "grad_norm": 3.934164524078369,
1362
  "learning_rate": 1.2940651494868363e-05,
1363
- "loss": 0.0136,
1364
  "step": 1910
1365
  },
1366
  {
1367
  "epoch": 2.312462372065021,
1368
- "grad_norm": 0.12872309982776642,
1369
  "learning_rate": 1.2717536813922356e-05,
1370
- "loss": 0.0021,
1371
  "step": 1920
1372
  },
1373
  {
1374
  "epoch": 2.3245033112582782,
1375
- "grad_norm": 44.9831657409668,
1376
  "learning_rate": 1.2494422132976352e-05,
1377
- "loss": 0.003,
1378
  "step": 1930
1379
  },
1380
  {
1381
  "epoch": 2.336544250451535,
1382
- "grad_norm": 0.02911054715514183,
1383
  "learning_rate": 1.2271307452030343e-05,
1384
- "loss": 0.0051,
1385
  "step": 1940
1386
  },
1387
  {
1388
  "epoch": 2.3485851896447922,
1389
- "grad_norm": 0.1190517470240593,
1390
  "learning_rate": 1.2048192771084338e-05,
1391
- "loss": 0.0129,
1392
  "step": 1950
1393
  },
1394
  {
1395
  "epoch": 2.3606261288380495,
1396
- "grad_norm": 1.1983438730239868,
1397
  "learning_rate": 1.1825078090138332e-05,
1398
- "loss": 0.0124,
1399
  "step": 1960
1400
  },
1401
  {
1402
  "epoch": 2.3726670680313067,
1403
- "grad_norm": 2.53009033203125,
1404
  "learning_rate": 1.1601963409192325e-05,
1405
- "loss": 0.0125,
1406
  "step": 1970
1407
  },
1408
  {
1409
  "epoch": 2.3847080072245634,
1410
- "grad_norm": 0.16323573887348175,
1411
  "learning_rate": 1.1378848728246319e-05,
1412
- "loss": 0.0024,
1413
  "step": 1980
1414
  },
1415
  {
1416
  "epoch": 2.3967489464178207,
1417
- "grad_norm": 0.08998429030179977,
1418
  "learning_rate": 1.1155734047300314e-05,
1419
- "loss": 0.0139,
1420
  "step": 1990
1421
  },
1422
  {
1423
  "epoch": 2.4087898856110774,
1424
- "grad_norm": 4.886857509613037,
1425
  "learning_rate": 1.0932619366354306e-05,
1426
- "loss": 0.0054,
1427
  "step": 2000
1428
  },
1429
  {
1430
  "epoch": 2.4208308248043346,
1431
- "grad_norm": 0.01201044674962759,
1432
  "learning_rate": 1.0709504685408301e-05,
1433
- "loss": 0.0035,
1434
  "step": 2010
1435
  },
1436
  {
1437
  "epoch": 2.432871763997592,
1438
- "grad_norm": 15.749720573425293,
1439
  "learning_rate": 1.0486390004462294e-05,
1440
- "loss": 0.0036,
1441
  "step": 2020
1442
  },
1443
  {
1444
  "epoch": 2.444912703190849,
1445
- "grad_norm": 1.3309760093688965,
1446
  "learning_rate": 1.0263275323516288e-05,
1447
- "loss": 0.0028,
1448
  "step": 2030
1449
  },
1450
  {
1451
  "epoch": 2.456953642384106,
1452
- "grad_norm": 0.008413532748818398,
1453
  "learning_rate": 1.0040160642570281e-05,
1454
- "loss": 0.0126,
1455
  "step": 2040
1456
  },
1457
  {
1458
  "epoch": 2.468994581577363,
1459
- "grad_norm": 2.204921245574951,
1460
  "learning_rate": 9.817045961624276e-06,
1461
- "loss": 0.004,
1462
  "step": 2050
1463
  },
1464
  {
1465
  "epoch": 2.4810355207706203,
1466
- "grad_norm": 0.04116813465952873,
1467
  "learning_rate": 9.593931280678268e-06,
1468
- "loss": 0.0029,
1469
  "step": 2060
1470
  },
1471
  {
1472
  "epoch": 2.493076459963877,
1473
- "grad_norm": 0.0023259413428604603,
1474
  "learning_rate": 9.370816599732263e-06,
1475
- "loss": 0.0048,
1476
  "step": 2070
1477
  },
1478
  {
1479
  "epoch": 2.5051173991571343,
1480
- "grad_norm": 0.21206362545490265,
1481
  "learning_rate": 9.147701918786257e-06,
1482
- "loss": 0.0022,
1483
  "step": 2080
1484
  },
1485
  {
1486
  "epoch": 2.5171583383503915,
1487
- "grad_norm": 6.095514297485352,
1488
  "learning_rate": 8.92458723784025e-06,
1489
- "loss": 0.0074,
1490
  "step": 2090
1491
  },
1492
  {
1493
  "epoch": 2.5291992775436483,
1494
- "grad_norm": 0.0026727153453975916,
1495
  "learning_rate": 8.701472556894244e-06,
1496
- "loss": 0.0059,
1497
  "step": 2100
1498
  },
1499
  {
1500
  "epoch": 2.5412402167369055,
1501
- "grad_norm": 1.8913568258285522,
1502
  "learning_rate": 8.478357875948239e-06,
1503
- "loss": 0.0007,
1504
  "step": 2110
1505
  },
1506
  {
1507
  "epoch": 2.5532811559301627,
1508
- "grad_norm": 0.06810165196657181,
1509
  "learning_rate": 8.25524319500223e-06,
1510
- "loss": 0.0025,
1511
  "step": 2120
1512
  },
1513
  {
1514
  "epoch": 2.5653220951234195,
1515
- "grad_norm": 1.2513346672058105,
1516
  "learning_rate": 8.032128514056226e-06,
1517
- "loss": 0.008,
1518
  "step": 2130
1519
  },
1520
  {
1521
  "epoch": 2.5773630343166767,
1522
- "grad_norm": 0.012058227322995663,
1523
  "learning_rate": 7.809013833110219e-06,
1524
- "loss": 0.0073,
1525
  "step": 2140
1526
  },
1527
  {
1528
  "epoch": 2.589403973509934,
1529
- "grad_norm": 0.011178981512784958,
1530
  "learning_rate": 7.5858991521642126e-06,
1531
- "loss": 0.0004,
1532
  "step": 2150
1533
  },
1534
  {
1535
  "epoch": 2.601444912703191,
1536
- "grad_norm": 0.003355270717293024,
1537
  "learning_rate": 7.362784471218207e-06,
1538
- "loss": 0.0051,
1539
  "step": 2160
1540
  },
1541
  {
1542
  "epoch": 2.613485851896448,
1543
- "grad_norm": 0.001846944447606802,
1544
  "learning_rate": 7.1396697902722e-06,
1545
- "loss": 0.0013,
1546
  "step": 2170
1547
  },
1548
  {
1549
  "epoch": 2.625526791089705,
1550
- "grad_norm": 0.003519382094964385,
1551
  "learning_rate": 6.916555109326194e-06,
1552
- "loss": 0.0067,
1553
  "step": 2180
1554
  },
1555
  {
1556
  "epoch": 2.637567730282962,
1557
- "grad_norm": 6.0739850997924805,
1558
  "learning_rate": 6.693440428380188e-06,
1559
- "loss": 0.0046,
1560
  "step": 2190
1561
  },
1562
  {
1563
  "epoch": 2.649608669476219,
1564
- "grad_norm": 11.507709503173828,
1565
  "learning_rate": 6.4703257474341815e-06,
1566
- "loss": 0.0033,
1567
  "step": 2200
1568
  },
1569
  {
1570
  "epoch": 2.6616496086694763,
1571
- "grad_norm": 0.13278013467788696,
1572
  "learning_rate": 6.247211066488176e-06,
1573
- "loss": 0.0048,
1574
  "step": 2210
1575
  },
1576
  {
1577
  "epoch": 2.6736905478627335,
1578
- "grad_norm": 0.018278079107403755,
1579
  "learning_rate": 6.024096385542169e-06,
1580
- "loss": 0.0073,
1581
  "step": 2220
1582
  },
1583
  {
1584
  "epoch": 2.6857314870559903,
1585
- "grad_norm": 0.2687268555164337,
1586
  "learning_rate": 5.800981704596163e-06,
1587
- "loss": 0.0002,
1588
  "step": 2230
1589
  },
1590
  {
1591
  "epoch": 2.6977724262492475,
1592
- "grad_norm": 6.959306716918945,
1593
  "learning_rate": 5.577867023650157e-06,
1594
- "loss": 0.0026,
1595
  "step": 2240
1596
  },
1597
  {
1598
  "epoch": 2.7098133654425043,
1599
- "grad_norm": 0.012783128768205643,
1600
  "learning_rate": 5.3547523427041504e-06,
1601
- "loss": 0.0,
1602
  "step": 2250
1603
  },
1604
  {
1605
  "epoch": 2.7218543046357615,
1606
- "grad_norm": 0.0021891624201089144,
1607
  "learning_rate": 5.131637661758144e-06,
1608
- "loss": 0.0011,
1609
  "step": 2260
1610
  },
1611
  {
1612
  "epoch": 2.7338952438290187,
1613
- "grad_norm": 0.6082541346549988,
1614
  "learning_rate": 4.908522980812138e-06,
1615
- "loss": 0.0003,
1616
  "step": 2270
1617
  },
1618
  {
1619
  "epoch": 2.745936183022276,
1620
- "grad_norm": 0.07555132359266281,
1621
  "learning_rate": 4.685408299866132e-06,
1622
- "loss": 0.0045,
1623
  "step": 2280
1624
  },
1625
  {
1626
  "epoch": 2.7579771222155327,
1627
- "grad_norm": 2.3299763202667236,
1628
  "learning_rate": 4.462293618920125e-06,
1629
- "loss": 0.004,
1630
  "step": 2290
1631
  },
1632
  {
1633
  "epoch": 2.77001806140879,
1634
- "grad_norm": 0.035113465040922165,
1635
  "learning_rate": 4.239178937974119e-06,
1636
- "loss": 0.0002,
1637
  "step": 2300
1638
  },
1639
  {
1640
  "epoch": 2.7820590006020467,
1641
- "grad_norm": 0.004046901594847441,
1642
  "learning_rate": 4.016064257028113e-06,
1643
- "loss": 0.0042,
1644
  "step": 2310
1645
  },
1646
  {
1647
  "epoch": 2.794099939795304,
1648
- "grad_norm": 0.9368377923965454,
1649
  "learning_rate": 3.7929495760821063e-06,
1650
- "loss": 0.0007,
1651
  "step": 2320
1652
  },
1653
  {
1654
  "epoch": 2.806140878988561,
1655
- "grad_norm": 0.4543118178844452,
1656
  "learning_rate": 3.5698348951361e-06,
1657
- "loss": 0.0058,
1658
  "step": 2330
1659
  },
1660
  {
1661
  "epoch": 2.8181818181818183,
1662
- "grad_norm": 0.005980230402201414,
1663
  "learning_rate": 3.346720214190094e-06,
1664
- "loss": 0.0115,
1665
  "step": 2340
1666
  },
1667
  {
1668
  "epoch": 2.830222757375075,
1669
- "grad_norm": 0.0012502376921474934,
1670
  "learning_rate": 3.123605533244088e-06,
1671
- "loss": 0.0,
1672
  "step": 2350
1673
  },
1674
  {
1675
  "epoch": 2.8422636965683323,
1676
- "grad_norm": 10.579268455505371,
1677
  "learning_rate": 2.9004908522980813e-06,
1678
- "loss": 0.0096,
1679
  "step": 2360
1680
  },
1681
  {
1682
  "epoch": 2.8543046357615895,
1683
- "grad_norm": 0.2079772800207138,
1684
  "learning_rate": 2.6773761713520752e-06,
1685
- "loss": 0.0099,
1686
  "step": 2370
1687
  },
1688
  {
1689
  "epoch": 2.8663455749548463,
1690
- "grad_norm": 29.779769897460938,
1691
  "learning_rate": 2.454261490406069e-06,
1692
- "loss": 0.0129,
1693
  "step": 2380
1694
  },
1695
  {
1696
  "epoch": 2.8783865141481035,
1697
- "grad_norm": 0.18259233236312866,
1698
  "learning_rate": 2.2311468094600625e-06,
1699
- "loss": 0.0041,
1700
  "step": 2390
1701
  },
1702
  {
1703
  "epoch": 2.8904274533413608,
1704
- "grad_norm": 0.00777797307819128,
1705
  "learning_rate": 2.0080321285140564e-06,
1706
- "loss": 0.0013,
1707
  "step": 2400
1708
  },
1709
  {
1710
  "epoch": 2.902468392534618,
1711
- "grad_norm": 0.001924191485159099,
1712
  "learning_rate": 1.78491744756805e-06,
1713
- "loss": 0.0001,
1714
  "step": 2410
1715
  },
1716
  {
1717
  "epoch": 2.9145093317278747,
1718
- "grad_norm": 0.019644932821393013,
1719
  "learning_rate": 1.561802766622044e-06,
1720
- "loss": 0.0,
1721
  "step": 2420
1722
  },
1723
  {
1724
  "epoch": 2.926550270921132,
1725
- "grad_norm": 0.09246612340211868,
1726
  "learning_rate": 1.3386880856760376e-06,
1727
- "loss": 0.0002,
1728
  "step": 2430
1729
  },
1730
  {
1731
  "epoch": 2.9385912101143887,
1732
- "grad_norm": 2.428246259689331,
1733
  "learning_rate": 1.1155734047300313e-06,
1734
- "loss": 0.0002,
1735
  "step": 2440
1736
  },
1737
  {
1738
  "epoch": 2.950632149307646,
1739
- "grad_norm": 23.632675170898438,
1740
  "learning_rate": 8.92458723784025e-07,
1741
- "loss": 0.0056,
1742
  "step": 2450
1743
  },
1744
  {
1745
  "epoch": 2.962673088500903,
1746
- "grad_norm": 20.292911529541016,
1747
  "learning_rate": 6.693440428380188e-07,
1748
- "loss": 0.0086,
1749
  "step": 2460
1750
  },
1751
  {
1752
  "epoch": 2.9747140276941604,
1753
- "grad_norm": 0.10044017434120178,
1754
  "learning_rate": 4.462293618920125e-07,
1755
- "loss": 0.0015,
1756
  "step": 2470
1757
  },
1758
  {
1759
  "epoch": 2.986754966887417,
1760
- "grad_norm": 0.31968560814857483,
1761
  "learning_rate": 2.2311468094600626e-07,
1762
- "loss": 0.0003,
1763
  "step": 2480
1764
  },
1765
  {
1766
  "epoch": 2.9987959060806744,
1767
- "grad_norm": 0.010867997072637081,
1768
  "learning_rate": 0.0,
1769
- "loss": 0.0002,
1770
  "step": 2490
1771
  },
1772
  {
1773
  "epoch": 2.9987959060806744,
1774
- "eval_f1": 0.9959466783459585,
1775
- "eval_loss": 0.00824679434299469,
1776
- "eval_runtime": 91.491,
1777
- "eval_samples_per_second": 129.084,
1778
- "eval_steps_per_second": 4.044,
1779
  "step": 2490
1780
  },
1781
  {
1782
  "epoch": 2.9987959060806744,
1783
  "step": 2490,
1784
  "total_flos": 2.4970885598061527e+19,
1785
- "train_loss": 0.046114599592564115,
1786
- "train_runtime": 4604.1373,
1787
- "train_samples_per_second": 69.254,
1788
- "train_steps_per_second": 0.541
1789
  }
1790
  ],
1791
  "logging_steps": 10,
 
1
  {
2
+ "best_metric": 0.9951696391786315,
3
  "best_model_checkpoint": "swin-base-patch4-window7-224-finetuned-eurosat/checkpoint-2490",
4
  "epoch": 2.9987959060806744,
5
  "eval_steps": 500,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.012040939193257074,
13
+ "grad_norm": 37.881980895996094,
14
  "learning_rate": 2.0080321285140564e-06,
15
+ "loss": 1.3851,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.024081878386514148,
20
+ "grad_norm": 27.445205688476562,
21
  "learning_rate": 4.016064257028113e-06,
22
+ "loss": 1.1863,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.036122817579771226,
27
+ "grad_norm": 26.590789794921875,
28
  "learning_rate": 6.024096385542169e-06,
29
+ "loss": 0.9271,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.048163756773028296,
34
+ "grad_norm": 24.892778396606445,
35
  "learning_rate": 8.032128514056226e-06,
36
+ "loss": 0.5864,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.060204695966285374,
41
+ "grad_norm": 28.44145393371582,
42
  "learning_rate": 1.0040160642570281e-05,
43
+ "loss": 0.3409,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.07224563515954245,
48
+ "grad_norm": 53.996341705322266,
49
  "learning_rate": 1.2048192771084338e-05,
50
+ "loss": 0.2325,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.08428657435279951,
55
+ "grad_norm": 52.281253814697266,
56
  "learning_rate": 1.4056224899598394e-05,
57
+ "loss": 0.1837,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.09632751354605659,
62
+ "grad_norm": 33.00680160522461,
63
  "learning_rate": 1.606425702811245e-05,
64
+ "loss": 0.1589,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.10836845273931367,
69
+ "grad_norm": 48.353519439697266,
70
  "learning_rate": 1.8072289156626505e-05,
71
+ "loss": 0.1678,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.12040939193257075,
76
+ "grad_norm": 40.386356353759766,
77
  "learning_rate": 2.0080321285140562e-05,
78
+ "loss": 0.1699,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.13245033112582782,
83
+ "grad_norm": 57.39128112792969,
84
  "learning_rate": 2.208835341365462e-05,
85
+ "loss": 0.1331,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.1444912703190849,
90
+ "grad_norm": 32.292842864990234,
91
  "learning_rate": 2.4096385542168677e-05,
92
+ "loss": 0.0939,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 0.15653220951234195,
97
+ "grad_norm": 57.7134895324707,
98
  "learning_rate": 2.6104417670682734e-05,
99
+ "loss": 0.1299,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 0.16857314870559903,
104
+ "grad_norm": 19.494935989379883,
105
  "learning_rate": 2.8112449799196788e-05,
106
+ "loss": 0.1485,
107
  "step": 140
108
  },
109
  {
110
  "epoch": 0.1806140878988561,
111
+ "grad_norm": 34.56547164916992,
112
  "learning_rate": 3.012048192771085e-05,
113
+ "loss": 0.102,
114
  "step": 150
115
  },
116
  {
117
  "epoch": 0.19265502709211318,
118
+ "grad_norm": 53.24101638793945,
119
  "learning_rate": 3.21285140562249e-05,
120
+ "loss": 0.0968,
121
  "step": 160
122
  },
123
  {
124
  "epoch": 0.20469596628537026,
125
+ "grad_norm": 20.195816040039062,
126
  "learning_rate": 3.413654618473896e-05,
127
+ "loss": 0.1091,
128
  "step": 170
129
  },
130
  {
131
  "epoch": 0.21673690547862734,
132
+ "grad_norm": 38.21851348876953,
133
  "learning_rate": 3.614457831325301e-05,
134
+ "loss": 0.0995,
135
  "step": 180
136
  },
137
  {
138
  "epoch": 0.22877784467188442,
139
+ "grad_norm": 78.3773193359375,
140
  "learning_rate": 3.815261044176707e-05,
141
+ "loss": 0.0987,
142
  "step": 190
143
  },
144
  {
145
  "epoch": 0.2408187838651415,
146
+ "grad_norm": 19.694732666015625,
147
  "learning_rate": 4.0160642570281125e-05,
148
+ "loss": 0.0757,
149
  "step": 200
150
  },
151
  {
152
  "epoch": 0.25285972305839854,
153
+ "grad_norm": 22.578950881958008,
154
  "learning_rate": 4.2168674698795186e-05,
155
+ "loss": 0.0906,
156
  "step": 210
157
  },
158
  {
159
  "epoch": 0.26490066225165565,
160
+ "grad_norm": 16.256385803222656,
161
  "learning_rate": 4.417670682730924e-05,
162
+ "loss": 0.072,
163
  "step": 220
164
  },
165
  {
166
  "epoch": 0.2769416014449127,
167
+ "grad_norm": 19.03795623779297,
168
  "learning_rate": 4.61847389558233e-05,
169
+ "loss": 0.1194,
170
  "step": 230
171
  },
172
  {
173
  "epoch": 0.2889825406381698,
174
+ "grad_norm": 28.470212936401367,
175
  "learning_rate": 4.8192771084337354e-05,
176
+ "loss": 0.1137,
177
  "step": 240
178
  },
179
  {
180
  "epoch": 0.30102347983142685,
181
+ "grad_norm": 26.385528564453125,
182
  "learning_rate": 4.9977688531905406e-05,
183
+ "loss": 0.0885,
184
  "step": 250
185
  },
186
  {
187
  "epoch": 0.3130644190246839,
188
+ "grad_norm": 13.934977531433105,
189
  "learning_rate": 4.97545738509594e-05,
190
+ "loss": 0.0767,
191
  "step": 260
192
  },
193
  {
194
  "epoch": 0.325105358217941,
195
+ "grad_norm": 13.682855606079102,
196
  "learning_rate": 4.953145917001339e-05,
197
+ "loss": 0.0935,
198
  "step": 270
199
  },
200
  {
201
  "epoch": 0.33714629741119806,
202
+ "grad_norm": 12.338207244873047,
203
  "learning_rate": 4.930834448906738e-05,
204
+ "loss": 0.0621,
205
  "step": 280
206
  },
207
  {
208
  "epoch": 0.34918723660445516,
209
+ "grad_norm": 9.674701690673828,
210
  "learning_rate": 4.908522980812137e-05,
211
+ "loss": 0.1039,
212
  "step": 290
213
  },
214
  {
215
  "epoch": 0.3612281757977122,
216
+ "grad_norm": 16.479585647583008,
217
  "learning_rate": 4.886211512717537e-05,
218
+ "loss": 0.0928,
219
  "step": 300
220
  },
221
  {
222
  "epoch": 0.3732691149909693,
223
+ "grad_norm": 25.431562423706055,
224
  "learning_rate": 4.8639000446229364e-05,
225
+ "loss": 0.0737,
226
  "step": 310
227
  },
228
  {
229
  "epoch": 0.38531005418422637,
230
+ "grad_norm": 6.355775833129883,
231
  "learning_rate": 4.8415885765283355e-05,
232
+ "loss": 0.081,
233
  "step": 320
234
  },
235
  {
236
  "epoch": 0.3973509933774834,
237
+ "grad_norm": 13.13481616973877,
238
  "learning_rate": 4.8192771084337354e-05,
239
+ "loss": 0.0968,
240
  "step": 330
241
  },
242
  {
243
  "epoch": 0.4093919325707405,
244
+ "grad_norm": 5.214439868927002,
245
  "learning_rate": 4.7969656403391346e-05,
246
+ "loss": 0.0734,
247
  "step": 340
248
  },
249
  {
250
  "epoch": 0.4214328717639976,
251
+ "grad_norm": 17.911102294921875,
252
  "learning_rate": 4.774654172244534e-05,
253
+ "loss": 0.0323,
254
  "step": 350
255
  },
256
  {
257
  "epoch": 0.4334738109572547,
258
+ "grad_norm": 20.677658081054688,
259
  "learning_rate": 4.7523427041499336e-05,
260
+ "loss": 0.0442,
261
  "step": 360
262
  },
263
  {
264
  "epoch": 0.44551475015051173,
265
+ "grad_norm": 21.959959030151367,
266
  "learning_rate": 4.730031236055333e-05,
267
+ "loss": 0.1071,
268
  "step": 370
269
  },
270
  {
271
  "epoch": 0.45755568934376883,
272
+ "grad_norm": 11.989217758178711,
273
  "learning_rate": 4.707719767960732e-05,
274
+ "loss": 0.1196,
275
  "step": 380
276
  },
277
  {
278
  "epoch": 0.4695966285370259,
279
+ "grad_norm": 18.601367950439453,
280
  "learning_rate": 4.685408299866131e-05,
281
+ "loss": 0.084,
282
  "step": 390
283
  },
284
  {
285
  "epoch": 0.481637567730283,
286
+ "grad_norm": 26.319194793701172,
287
  "learning_rate": 4.663096831771531e-05,
288
+ "loss": 0.1009,
289
  "step": 400
290
  },
291
  {
292
  "epoch": 0.49367850692354004,
293
+ "grad_norm": 16.73026466369629,
294
  "learning_rate": 4.64078536367693e-05,
295
+ "loss": 0.065,
296
  "step": 410
297
  },
298
  {
299
  "epoch": 0.5057194461167971,
300
+ "grad_norm": 6.364532947540283,
301
  "learning_rate": 4.61847389558233e-05,
302
+ "loss": 0.0411,
303
  "step": 420
304
  },
305
  {
306
  "epoch": 0.5177603853100542,
307
+ "grad_norm": 15.919637680053711,
308
  "learning_rate": 4.596162427487729e-05,
309
+ "loss": 0.0826,
310
  "step": 430
311
  },
312
  {
313
  "epoch": 0.5298013245033113,
314
+ "grad_norm": 9.566567420959473,
315
  "learning_rate": 4.5738509593931284e-05,
316
+ "loss": 0.0626,
317
  "step": 440
318
  },
319
  {
320
  "epoch": 0.5418422636965683,
321
+ "grad_norm": 15.061929702758789,
322
  "learning_rate": 4.5515394912985275e-05,
323
+ "loss": 0.0628,
324
  "step": 450
325
  },
326
  {
327
  "epoch": 0.5538832028898254,
328
+ "grad_norm": 9.935952186584473,
329
  "learning_rate": 4.529228023203927e-05,
330
+ "loss": 0.0522,
331
  "step": 460
332
  },
333
  {
334
  "epoch": 0.5659241420830825,
335
+ "grad_norm": 25.847078323364258,
336
  "learning_rate": 4.506916555109326e-05,
337
+ "loss": 0.0542,
338
  "step": 470
339
  },
340
  {
341
  "epoch": 0.5779650812763396,
342
+ "grad_norm": 26.931169509887695,
343
  "learning_rate": 4.484605087014726e-05,
344
+ "loss": 0.0678,
345
  "step": 480
346
  },
347
  {
348
  "epoch": 0.5900060204695966,
349
+ "grad_norm": 16.654766082763672,
350
  "learning_rate": 4.4622936189201256e-05,
351
+ "loss": 0.056,
352
  "step": 490
353
  },
354
  {
355
  "epoch": 0.6020469596628537,
356
+ "grad_norm": 28.53946304321289,
357
  "learning_rate": 4.439982150825525e-05,
358
+ "loss": 0.0477,
359
  "step": 500
360
  },
361
  {
362
  "epoch": 0.6140878988561108,
363
+ "grad_norm": 27.371204376220703,
364
  "learning_rate": 4.417670682730924e-05,
365
+ "loss": 0.0627,
366
  "step": 510
367
  },
368
  {
369
  "epoch": 0.6261288380493678,
370
+ "grad_norm": 10.891378402709961,
371
  "learning_rate": 4.395359214636323e-05,
372
+ "loss": 0.0435,
373
  "step": 520
374
  },
375
  {
376
  "epoch": 0.6381697772426249,
377
+ "grad_norm": 16.968677520751953,
378
  "learning_rate": 4.373047746541722e-05,
379
+ "loss": 0.0418,
380
  "step": 530
381
  },
382
  {
383
  "epoch": 0.650210716435882,
384
+ "grad_norm": 7.117012023925781,
385
  "learning_rate": 4.350736278447122e-05,
386
+ "loss": 0.035,
387
  "step": 540
388
  },
389
  {
390
  "epoch": 0.6622516556291391,
391
+ "grad_norm": 42.22456741333008,
392
  "learning_rate": 4.328424810352521e-05,
393
+ "loss": 0.0332,
394
  "step": 550
395
  },
396
  {
397
  "epoch": 0.6742925948223961,
398
+ "grad_norm": 8.816338539123535,
399
  "learning_rate": 4.306113342257921e-05,
400
+ "loss": 0.0632,
401
  "step": 560
402
  },
403
  {
404
  "epoch": 0.6863335340156532,
405
+ "grad_norm": 4.652446269989014,
406
  "learning_rate": 4.2838018741633203e-05,
407
+ "loss": 0.059,
408
  "step": 570
409
  },
410
  {
411
  "epoch": 0.6983744732089103,
412
+ "grad_norm": 14.168267250061035,
413
  "learning_rate": 4.2614904060687195e-05,
414
+ "loss": 0.0457,
415
  "step": 580
416
  },
417
  {
418
  "epoch": 0.7104154124021673,
419
+ "grad_norm": 21.56100845336914,
420
  "learning_rate": 4.239178937974119e-05,
421
+ "loss": 0.0339,
422
  "step": 590
423
  },
424
  {
425
  "epoch": 0.7224563515954244,
426
+ "grad_norm": 30.049415588378906,
427
  "learning_rate": 4.2168674698795186e-05,
428
+ "loss": 0.041,
429
  "step": 600
430
  },
431
  {
432
  "epoch": 0.7344972907886815,
433
+ "grad_norm": 16.741825103759766,
434
  "learning_rate": 4.194556001784918e-05,
435
+ "loss": 0.0468,
436
  "step": 610
437
  },
438
  {
439
  "epoch": 0.7465382299819386,
440
+ "grad_norm": 14.885356903076172,
441
  "learning_rate": 4.172244533690317e-05,
442
+ "loss": 0.0274,
443
  "step": 620
444
  },
445
  {
446
  "epoch": 0.7585791691751956,
447
+ "grad_norm": 14.38347339630127,
448
  "learning_rate": 4.149933065595716e-05,
449
+ "loss": 0.0429,
450
  "step": 630
451
  },
452
  {
453
  "epoch": 0.7706201083684527,
454
+ "grad_norm": 17.572885513305664,
455
  "learning_rate": 4.127621597501116e-05,
456
+ "loss": 0.0382,
457
  "step": 640
458
  },
459
  {
460
  "epoch": 0.7826610475617098,
461
+ "grad_norm": 6.489743709564209,
462
  "learning_rate": 4.105310129406515e-05,
463
+ "loss": 0.0268,
464
  "step": 650
465
  },
466
  {
467
  "epoch": 0.7947019867549668,
468
+ "grad_norm": 26.665414810180664,
469
  "learning_rate": 4.082998661311915e-05,
470
+ "loss": 0.0625,
471
  "step": 660
472
  },
473
  {
474
  "epoch": 0.8067429259482239,
475
+ "grad_norm": 18.689409255981445,
476
  "learning_rate": 4.060687193217314e-05,
477
+ "loss": 0.0445,
478
  "step": 670
479
  },
480
  {
481
  "epoch": 0.818783865141481,
482
+ "grad_norm": 9.611627578735352,
483
  "learning_rate": 4.038375725122713e-05,
484
+ "loss": 0.036,
485
  "step": 680
486
  },
487
  {
488
  "epoch": 0.8308248043347382,
489
+ "grad_norm": 23.417274475097656,
490
  "learning_rate": 4.0160642570281125e-05,
491
+ "loss": 0.0338,
492
  "step": 690
493
  },
494
  {
495
  "epoch": 0.8428657435279951,
496
+ "grad_norm": 10.954760551452637,
497
  "learning_rate": 3.993752788933512e-05,
498
+ "loss": 0.0369,
499
  "step": 700
500
  },
501
  {
502
  "epoch": 0.8549066827212523,
503
+ "grad_norm": 23.96639633178711,
504
  "learning_rate": 3.9714413208389115e-05,
505
+ "loss": 0.0531,
506
  "step": 710
507
  },
508
  {
509
  "epoch": 0.8669476219145094,
510
+ "grad_norm": 8.682661056518555,
511
  "learning_rate": 3.949129852744311e-05,
512
+ "loss": 0.0497,
513
  "step": 720
514
  },
515
  {
516
  "epoch": 0.8789885611077664,
517
+ "grad_norm": 4.494186878204346,
518
  "learning_rate": 3.9268183846497105e-05,
519
+ "loss": 0.0285,
520
  "step": 730
521
  },
522
  {
523
  "epoch": 0.8910295003010235,
524
+ "grad_norm": 15.755534172058105,
525
  "learning_rate": 3.90450691655511e-05,
526
+ "loss": 0.0322,
527
  "step": 740
528
  },
529
  {
530
  "epoch": 0.9030704394942806,
531
+ "grad_norm": 7.190816879272461,
532
  "learning_rate": 3.882195448460509e-05,
533
+ "loss": 0.0301,
534
  "step": 750
535
  },
536
  {
537
  "epoch": 0.9151113786875377,
538
+ "grad_norm": 0.7052440047264099,
539
  "learning_rate": 3.859883980365908e-05,
540
+ "loss": 0.0191,
541
  "step": 760
542
  },
543
  {
544
  "epoch": 0.9271523178807947,
545
+ "grad_norm": 10.254659652709961,
546
  "learning_rate": 3.837572512271307e-05,
547
+ "loss": 0.0333,
548
  "step": 770
549
  },
550
  {
551
  "epoch": 0.9391932570740518,
552
+ "grad_norm": 16.437780380249023,
553
  "learning_rate": 3.815261044176707e-05,
554
+ "loss": 0.0324,
555
  "step": 780
556
  },
557
  {
558
  "epoch": 0.9512341962673089,
559
+ "grad_norm": 17.565311431884766,
560
  "learning_rate": 3.792949576082106e-05,
561
+ "loss": 0.0318,
562
  "step": 790
563
  },
564
  {
565
  "epoch": 0.963275135460566,
566
+ "grad_norm": 9.585817337036133,
567
  "learning_rate": 3.770638107987506e-05,
568
+ "loss": 0.0322,
569
  "step": 800
570
  },
571
  {
572
  "epoch": 0.975316074653823,
573
+ "grad_norm": 20.597179412841797,
574
  "learning_rate": 3.748326639892905e-05,
575
+ "loss": 0.0342,
576
  "step": 810
577
  },
578
  {
579
  "epoch": 0.9873570138470801,
580
+ "grad_norm": 26.928421020507812,
581
  "learning_rate": 3.7260151717983045e-05,
582
+ "loss": 0.0388,
583
  "step": 820
584
  },
585
  {
586
  "epoch": 0.9993979530403372,
587
+ "grad_norm": 23.810352325439453,
588
  "learning_rate": 3.7037037037037037e-05,
589
+ "loss": 0.0376,
590
  "step": 830
591
  },
592
  {
593
  "epoch": 0.9993979530403372,
594
+ "eval_f1": 0.9759432907017198,
595
+ "eval_loss": 0.022286901250481606,
596
+ "eval_runtime": 124.0129,
597
+ "eval_samples_per_second": 95.232,
598
+ "eval_steps_per_second": 2.984,
599
  "step": 830
600
  },
601
  {
602
  "epoch": 1.0117399157134257,
603
+ "grad_norm": 10.749777793884277,
604
  "learning_rate": 3.6813922356091035e-05,
605
+ "loss": 0.0336,
606
  "step": 840
607
  },
608
  {
609
  "epoch": 1.0237808549066827,
610
+ "grad_norm": 9.403740882873535,
611
  "learning_rate": 3.659080767514503e-05,
612
+ "loss": 0.0179,
613
  "step": 850
614
  },
615
  {
616
  "epoch": 1.03582179409994,
617
+ "grad_norm": 3.2515432834625244,
618
  "learning_rate": 3.636769299419902e-05,
619
+ "loss": 0.0318,
620
  "step": 860
621
  },
622
  {
623
  "epoch": 1.047862733293197,
624
+ "grad_norm": 59.94658279418945,
625
  "learning_rate": 3.614457831325301e-05,
626
+ "loss": 0.0305,
627
  "step": 870
628
  },
629
  {
630
  "epoch": 1.059903672486454,
631
+ "grad_norm": 29.028059005737305,
632
  "learning_rate": 3.592146363230701e-05,
633
+ "loss": 0.0345,
634
  "step": 880
635
  },
636
  {
637
  "epoch": 1.0719446116797111,
638
+ "grad_norm": 19.99188232421875,
639
  "learning_rate": 3.5698348951361e-05,
640
+ "loss": 0.0305,
641
  "step": 890
642
  },
643
  {
644
  "epoch": 1.083985550872968,
645
+ "grad_norm": 10.841901779174805,
646
  "learning_rate": 3.5475234270415e-05,
647
+ "loss": 0.0283,
648
  "step": 900
649
  },
650
  {
651
  "epoch": 1.096026490066225,
652
+ "grad_norm": 23.107820510864258,
653
  "learning_rate": 3.525211958946899e-05,
654
+ "loss": 0.0195,
655
  "step": 910
656
  },
657
  {
658
  "epoch": 1.1080674292594823,
659
+ "grad_norm": 25.3140811920166,
660
  "learning_rate": 3.502900490852298e-05,
661
+ "loss": 0.0303,
662
  "step": 920
663
  },
664
  {
665
  "epoch": 1.1201083684527393,
666
+ "grad_norm": 24.250219345092773,
667
  "learning_rate": 3.4805890227576974e-05,
668
+ "loss": 0.0426,
669
  "step": 930
670
  },
671
  {
672
  "epoch": 1.1321493076459963,
673
+ "grad_norm": 6.2568793296813965,
674
  "learning_rate": 3.4582775546630966e-05,
675
+ "loss": 0.0289,
676
  "step": 940
677
  },
678
  {
679
  "epoch": 1.1441902468392535,
680
+ "grad_norm": 6.120177268981934,
681
  "learning_rate": 3.4359660865684965e-05,
682
+ "loss": 0.0469,
683
  "step": 950
684
  },
685
  {
686
  "epoch": 1.1562311860325105,
687
+ "grad_norm": 31.236495971679688,
688
  "learning_rate": 3.413654618473896e-05,
689
+ "loss": 0.0297,
690
  "step": 960
691
  },
692
  {
693
  "epoch": 1.1682721252257675,
694
+ "grad_norm": 9.599982261657715,
695
  "learning_rate": 3.3913431503792955e-05,
696
+ "loss": 0.0217,
697
  "step": 970
698
  },
699
  {
700
  "epoch": 1.1803130644190247,
701
+ "grad_norm": 20.92270278930664,
702
  "learning_rate": 3.369031682284695e-05,
703
+ "loss": 0.0325,
704
  "step": 980
705
  },
706
  {
707
  "epoch": 1.1923540036122817,
708
+ "grad_norm": 10.290868759155273,
709
  "learning_rate": 3.346720214190094e-05,
710
+ "loss": 0.0372,
711
  "step": 990
712
  },
713
  {
714
  "epoch": 1.2043949428055387,
715
+ "grad_norm": 17.62599754333496,
716
  "learning_rate": 3.324408746095493e-05,
717
+ "loss": 0.0305,
718
  "step": 1000
719
  },
720
  {
721
  "epoch": 1.216435881998796,
722
+ "grad_norm": 6.780280590057373,
723
  "learning_rate": 3.302097278000892e-05,
724
+ "loss": 0.0172,
725
  "step": 1010
726
  },
727
  {
728
  "epoch": 1.228476821192053,
729
+ "grad_norm": 16.38884735107422,
730
  "learning_rate": 3.279785809906292e-05,
731
+ "loss": 0.0249,
732
  "step": 1020
733
  },
734
  {
735
  "epoch": 1.2405177603853101,
736
+ "grad_norm": 16.210906982421875,
737
  "learning_rate": 3.257474341811691e-05,
738
+ "loss": 0.0436,
739
  "step": 1030
740
  },
741
  {
742
  "epoch": 1.2525586995785671,
743
+ "grad_norm": 12.718667984008789,
744
  "learning_rate": 3.235162873717091e-05,
745
+ "loss": 0.017,
746
  "step": 1040
747
  },
748
  {
749
  "epoch": 1.2645996387718241,
750
+ "grad_norm": 0.5069631338119507,
751
  "learning_rate": 3.21285140562249e-05,
752
+ "loss": 0.0358,
753
  "step": 1050
754
  },
755
  {
756
  "epoch": 1.2766405779650813,
757
+ "grad_norm": 8.307098388671875,
758
  "learning_rate": 3.1905399375278894e-05,
759
+ "loss": 0.011,
760
  "step": 1060
761
  },
762
  {
763
  "epoch": 1.2886815171583383,
764
+ "grad_norm": 7.443680763244629,
765
  "learning_rate": 3.1682284694332886e-05,
766
+ "loss": 0.0254,
767
  "step": 1070
768
  },
769
  {
770
  "epoch": 1.3007224563515956,
771
+ "grad_norm": 13.364812850952148,
772
  "learning_rate": 3.1459170013386885e-05,
773
+ "loss": 0.0164,
774
  "step": 1080
775
  },
776
  {
777
  "epoch": 1.3127633955448526,
778
+ "grad_norm": 17.30490493774414,
779
  "learning_rate": 3.1236055332440876e-05,
780
+ "loss": 0.0293,
781
  "step": 1090
782
  },
783
  {
784
  "epoch": 1.3248043347381095,
785
+ "grad_norm": 3.6283154487609863,
786
  "learning_rate": 3.101294065149487e-05,
787
+ "loss": 0.019,
788
  "step": 1100
789
  },
790
  {
791
  "epoch": 1.3368452739313668,
792
+ "grad_norm": 24.23577308654785,
793
  "learning_rate": 3.078982597054887e-05,
794
+ "loss": 0.0136,
795
  "step": 1110
796
  },
797
  {
798
  "epoch": 1.3488862131246238,
799
+ "grad_norm": 0.5705559849739075,
800
  "learning_rate": 3.056671128960286e-05,
801
+ "loss": 0.0282,
802
  "step": 1120
803
  },
804
  {
805
  "epoch": 1.3609271523178808,
806
+ "grad_norm": 25.361787796020508,
807
  "learning_rate": 3.034359660865685e-05,
808
+ "loss": 0.0272,
809
  "step": 1130
810
  },
811
  {
812
  "epoch": 1.372968091511138,
813
+ "grad_norm": 8.514543533325195,
814
  "learning_rate": 3.012048192771085e-05,
815
+ "loss": 0.0214,
816
  "step": 1140
817
  },
818
  {
819
  "epoch": 1.385009030704395,
820
+ "grad_norm": 25.069978713989258,
821
  "learning_rate": 2.989736724676484e-05,
822
+ "loss": 0.0237,
823
  "step": 1150
824
  },
825
  {
826
  "epoch": 1.397049969897652,
827
+ "grad_norm": 33.56332015991211,
828
  "learning_rate": 2.9674252565818832e-05,
829
+ "loss": 0.0139,
830
  "step": 1160
831
  },
832
  {
833
  "epoch": 1.4090909090909092,
834
+ "grad_norm": 6.231565952301025,
835
  "learning_rate": 2.9451137884872827e-05,
836
+ "loss": 0.0279,
837
  "step": 1170
838
  },
839
  {
840
  "epoch": 1.4211318482841662,
841
+ "grad_norm": 1.0254443883895874,
842
  "learning_rate": 2.922802320392682e-05,
843
+ "loss": 0.0242,
844
  "step": 1180
845
  },
846
  {
847
  "epoch": 1.4331727874774232,
848
+ "grad_norm": 2.424696445465088,
849
  "learning_rate": 2.900490852298081e-05,
850
+ "loss": 0.0215,
851
  "step": 1190
852
  },
853
  {
854
  "epoch": 1.4452137266706804,
855
+ "grad_norm": 27.256423950195312,
856
  "learning_rate": 2.878179384203481e-05,
857
+ "loss": 0.0501,
858
  "step": 1200
859
  },
860
  {
861
  "epoch": 1.4572546658639374,
862
+ "grad_norm": 10.172979354858398,
863
  "learning_rate": 2.85586791610888e-05,
864
+ "loss": 0.0362,
865
  "step": 1210
866
  },
867
  {
868
  "epoch": 1.4692956050571944,
869
+ "grad_norm": 9.41952896118164,
870
  "learning_rate": 2.8335564480142796e-05,
871
+ "loss": 0.027,
872
  "step": 1220
873
  },
874
  {
875
  "epoch": 1.4813365442504516,
876
+ "grad_norm": 35.477848052978516,
877
  "learning_rate": 2.8112449799196788e-05,
878
+ "loss": 0.0118,
879
  "step": 1230
880
  },
881
  {
882
  "epoch": 1.4933774834437086,
883
+ "grad_norm": 0.077400341629982,
884
  "learning_rate": 2.788933511825078e-05,
885
+ "loss": 0.0283,
886
  "step": 1240
887
  },
888
  {
889
  "epoch": 1.5054184226369656,
890
+ "grad_norm": 10.312284469604492,
891
  "learning_rate": 2.7666220437304775e-05,
892
+ "loss": 0.0248,
893
  "step": 1250
894
  },
895
  {
896
  "epoch": 1.5174593618302228,
897
+ "grad_norm": 33.70686340332031,
898
  "learning_rate": 2.7443105756358774e-05,
899
+ "loss": 0.0073,
900
  "step": 1260
901
  },
902
  {
903
  "epoch": 1.5295003010234798,
904
+ "grad_norm": 1.402552604675293,
905
  "learning_rate": 2.7219991075412765e-05,
906
+ "loss": 0.0515,
907
  "step": 1270
908
  },
909
  {
910
  "epoch": 1.5415412402167368,
911
+ "grad_norm": 32.386474609375,
912
  "learning_rate": 2.6996876394466757e-05,
913
+ "loss": 0.0142,
914
  "step": 1280
915
  },
916
  {
917
  "epoch": 1.553582179409994,
918
+ "grad_norm": 17.349578857421875,
919
  "learning_rate": 2.6773761713520752e-05,
920
+ "loss": 0.019,
921
  "step": 1290
922
  },
923
  {
924
  "epoch": 1.5656231186032512,
925
+ "grad_norm": 7.253089904785156,
926
  "learning_rate": 2.6550647032574744e-05,
927
+ "loss": 0.0201,
928
  "step": 1300
929
  },
930
  {
931
  "epoch": 1.577664057796508,
932
+ "grad_norm": 0.33720967173576355,
933
  "learning_rate": 2.6327532351628736e-05,
934
+ "loss": 0.0106,
935
  "step": 1310
936
  },
937
  {
938
  "epoch": 1.5897049969897652,
939
+ "grad_norm": 33.96757125854492,
940
  "learning_rate": 2.6104417670682734e-05,
941
+ "loss": 0.0357,
942
  "step": 1320
943
  },
944
  {
945
  "epoch": 1.6017459361830224,
946
+ "grad_norm": 14.57841682434082,
947
  "learning_rate": 2.5881302989736726e-05,
948
+ "loss": 0.0138,
949
  "step": 1330
950
  },
951
  {
952
  "epoch": 1.6137868753762792,
953
+ "grad_norm": 14.796563148498535,
954
  "learning_rate": 2.565818830879072e-05,
955
+ "loss": 0.011,
956
  "step": 1340
957
  },
958
  {
959
  "epoch": 1.6258278145695364,
960
+ "grad_norm": 25.40349578857422,
961
  "learning_rate": 2.5435073627844713e-05,
962
+ "loss": 0.0222,
963
  "step": 1350
964
  },
965
  {
966
  "epoch": 1.6378687537627936,
967
+ "grad_norm": 15.135784149169922,
968
  "learning_rate": 2.5211958946898705e-05,
969
+ "loss": 0.0238,
970
  "step": 1360
971
  },
972
  {
973
  "epoch": 1.6499096929560506,
974
+ "grad_norm": 14.570870399475098,
975
  "learning_rate": 2.4988844265952703e-05,
976
+ "loss": 0.0236,
977
  "step": 1370
978
  },
979
  {
980
  "epoch": 1.6619506321493076,
981
+ "grad_norm": 15.314672470092773,
982
  "learning_rate": 2.4765729585006695e-05,
983
+ "loss": 0.0265,
984
  "step": 1380
985
  },
986
  {
987
  "epoch": 1.6739915713425648,
988
+ "grad_norm": 4.963397026062012,
989
  "learning_rate": 2.4542614904060687e-05,
990
+ "loss": 0.0165,
991
  "step": 1390
992
  },
993
  {
994
  "epoch": 1.6860325105358218,
995
+ "grad_norm": 11.424381256103516,
996
  "learning_rate": 2.4319500223114682e-05,
997
+ "loss": 0.0244,
998
  "step": 1400
999
  },
1000
  {
1001
  "epoch": 1.6980734497290788,
1002
+ "grad_norm": 0.2027626931667328,
1003
  "learning_rate": 2.4096385542168677e-05,
1004
+ "loss": 0.0101,
1005
  "step": 1410
1006
  },
1007
  {
1008
  "epoch": 1.710114388922336,
1009
+ "grad_norm": 11.940728187561035,
1010
  "learning_rate": 2.387327086122267e-05,
1011
+ "loss": 0.0201,
1012
  "step": 1420
1013
  },
1014
  {
1015
  "epoch": 1.722155328115593,
1016
+ "grad_norm": 12.448373794555664,
1017
  "learning_rate": 2.3650156180276664e-05,
1018
+ "loss": 0.0425,
1019
  "step": 1430
1020
  },
1021
  {
1022
  "epoch": 1.73419626730885,
1023
+ "grad_norm": 15.968953132629395,
1024
  "learning_rate": 2.3427041499330656e-05,
1025
+ "loss": 0.0294,
1026
  "step": 1440
1027
  },
1028
  {
1029
  "epoch": 1.7462372065021072,
1030
+ "grad_norm": 0.5868140459060669,
1031
  "learning_rate": 2.320392681838465e-05,
1032
+ "loss": 0.0328,
1033
  "step": 1450
1034
  },
1035
  {
1036
  "epoch": 1.7582781456953642,
1037
+ "grad_norm": 18.026885986328125,
1038
  "learning_rate": 2.2980812137438646e-05,
1039
+ "loss": 0.0363,
1040
  "step": 1460
1041
  },
1042
  {
1043
  "epoch": 1.7703190848886212,
1044
+ "grad_norm": 26.996437072753906,
1045
  "learning_rate": 2.2757697456492638e-05,
1046
+ "loss": 0.032,
1047
  "step": 1470
1048
  },
1049
  {
1050
  "epoch": 1.7823600240818784,
1051
+ "grad_norm": 25.193265914916992,
1052
  "learning_rate": 2.253458277554663e-05,
1053
+ "loss": 0.0191,
1054
  "step": 1480
1055
  },
1056
  {
1057
  "epoch": 1.7944009632751354,
1058
+ "grad_norm": 1.650780439376831,
1059
  "learning_rate": 2.2311468094600628e-05,
1060
+ "loss": 0.0147,
1061
  "step": 1490
1062
  },
1063
  {
1064
  "epoch": 1.8064419024683924,
1065
+ "grad_norm": 0.1049940213561058,
1066
  "learning_rate": 2.208835341365462e-05,
1067
+ "loss": 0.0213,
1068
  "step": 1500
1069
  },
1070
  {
1071
  "epoch": 1.8184828416616496,
1072
+ "grad_norm": 1.9366893768310547,
1073
  "learning_rate": 2.186523873270861e-05,
1074
+ "loss": 0.0176,
1075
  "step": 1510
1076
  },
1077
  {
1078
  "epoch": 1.8305237808549066,
1079
+ "grad_norm": 0.09167797118425369,
1080
  "learning_rate": 2.1642124051762607e-05,
1081
+ "loss": 0.0227,
1082
  "step": 1520
1083
  },
1084
  {
1085
  "epoch": 1.8425647200481636,
1086
+ "grad_norm": 20.63637351989746,
1087
  "learning_rate": 2.1419009370816602e-05,
1088
+ "loss": 0.0199,
1089
  "step": 1530
1090
  },
1091
  {
1092
  "epoch": 1.8546056592414208,
1093
+ "grad_norm": 7.514628887176514,
1094
  "learning_rate": 2.1195894689870593e-05,
1095
+ "loss": 0.0159,
1096
  "step": 1540
1097
  },
1098
  {
1099
  "epoch": 1.866646598434678,
1100
+ "grad_norm": 27.5736141204834,
1101
  "learning_rate": 2.097278000892459e-05,
1102
+ "loss": 0.0189,
1103
  "step": 1550
1104
  },
1105
  {
1106
  "epoch": 1.8786875376279348,
1107
+ "grad_norm": 9.517942428588867,
1108
  "learning_rate": 2.074966532797858e-05,
1109
+ "loss": 0.0036,
1110
  "step": 1560
1111
  },
1112
  {
1113
  "epoch": 1.890728476821192,
1114
+ "grad_norm": 1.2054420709609985,
1115
  "learning_rate": 2.0526550647032576e-05,
1116
+ "loss": 0.0211,
1117
  "step": 1570
1118
  },
1119
  {
1120
  "epoch": 1.9027694160144493,
1121
+ "grad_norm": 3.337954521179199,
1122
  "learning_rate": 2.030343596608657e-05,
1123
+ "loss": 0.0207,
1124
  "step": 1580
1125
  },
1126
  {
1127
  "epoch": 1.914810355207706,
1128
+ "grad_norm": 10.905184745788574,
1129
  "learning_rate": 2.0080321285140562e-05,
1130
+ "loss": 0.0123,
1131
  "step": 1590
1132
  },
1133
  {
1134
  "epoch": 1.9268512944009633,
1135
+ "grad_norm": 7.964990615844727,
1136
  "learning_rate": 1.9857206604194558e-05,
1137
+ "loss": 0.0204,
1138
  "step": 1600
1139
  },
1140
  {
1141
  "epoch": 1.9388922335942205,
1142
+ "grad_norm": 11.229500770568848,
1143
  "learning_rate": 1.9634091923248553e-05,
1144
+ "loss": 0.0258,
1145
  "step": 1610
1146
  },
1147
  {
1148
  "epoch": 1.9509331727874775,
1149
+ "grad_norm": 21.939224243164062,
1150
  "learning_rate": 1.9410977242302544e-05,
1151
+ "loss": 0.0121,
1152
  "step": 1620
1153
  },
1154
  {
1155
  "epoch": 1.9629741119807345,
1156
+ "grad_norm": 3.842597723007202,
1157
  "learning_rate": 1.9187862561356536e-05,
1158
+ "loss": 0.0157,
1159
  "step": 1630
1160
  },
1161
  {
1162
  "epoch": 1.9750150511739917,
1163
+ "grad_norm": 17.12714958190918,
1164
  "learning_rate": 1.896474788041053e-05,
1165
+ "loss": 0.0159,
1166
  "step": 1640
1167
  },
1168
  {
1169
  "epoch": 1.9870559903672487,
1170
+ "grad_norm": 23.860105514526367,
1171
  "learning_rate": 1.8741633199464527e-05,
1172
+ "loss": 0.0069,
1173
  "step": 1650
1174
  },
1175
  {
1176
  "epoch": 1.9990969295605057,
1177
+ "grad_norm": 31.328365325927734,
1178
  "learning_rate": 1.8518518518518518e-05,
1179
+ "loss": 0.0088,
1180
  "step": 1660
1181
  },
1182
  {
1183
  "epoch": 1.9990969295605057,
1184
+ "eval_f1": 0.991992163050005,
1185
+ "eval_loss": 0.014776448719203472,
1186
+ "eval_runtime": 123.3802,
1187
+ "eval_samples_per_second": 95.72,
1188
+ "eval_steps_per_second": 2.999,
1189
  "step": 1660
1190
  },
1191
  {
1192
  "epoch": 2.011438892233594,
1193
+ "grad_norm": 20.75777244567871,
1194
  "learning_rate": 1.8295403837572513e-05,
1195
+ "loss": 0.0128,
1196
  "step": 1670
1197
  },
1198
  {
1199
  "epoch": 2.0234798314268514,
1200
+ "grad_norm": 0.30569037795066833,
1201
  "learning_rate": 1.8072289156626505e-05,
1202
+ "loss": 0.0044,
1203
  "step": 1680
1204
  },
1205
  {
1206
  "epoch": 2.035520770620108,
1207
+ "grad_norm": 0.06706225126981735,
1208
  "learning_rate": 1.78491744756805e-05,
1209
+ "loss": 0.0133,
1210
  "step": 1690
1211
  },
1212
  {
1213
  "epoch": 2.0475617098133654,
1214
+ "grad_norm": 38.09376907348633,
1215
  "learning_rate": 1.7626059794734495e-05,
1216
+ "loss": 0.0068,
1217
  "step": 1700
1218
  },
1219
  {
1220
  "epoch": 2.0596026490066226,
1221
+ "grad_norm": 26.19437599182129,
1222
  "learning_rate": 1.7402945113788487e-05,
1223
+ "loss": 0.0258,
1224
  "step": 1710
1225
  },
1226
  {
1227
  "epoch": 2.07164358819988,
1228
+ "grad_norm": 0.3012349307537079,
1229
  "learning_rate": 1.7179830432842482e-05,
1230
+ "loss": 0.0255,
1231
  "step": 1720
1232
  },
1233
  {
1234
  "epoch": 2.0836845273931366,
1235
+ "grad_norm": 0.3738686144351959,
1236
  "learning_rate": 1.6956715751896478e-05,
1237
+ "loss": 0.0299,
1238
  "step": 1730
1239
  },
1240
  {
1241
  "epoch": 2.095725466586394,
1242
+ "grad_norm": 15.113606452941895,
1243
  "learning_rate": 1.673360107095047e-05,
1244
+ "loss": 0.0159,
1245
  "step": 1740
1246
  },
1247
  {
1248
  "epoch": 2.107766405779651,
1249
+ "grad_norm": 0.10998114198446274,
1250
  "learning_rate": 1.651048639000446e-05,
1251
+ "loss": 0.0123,
1252
  "step": 1750
1253
  },
1254
  {
1255
  "epoch": 2.119807344972908,
1256
+ "grad_norm": 0.23346582055091858,
1257
  "learning_rate": 1.6287371709058456e-05,
1258
+ "loss": 0.0015,
1259
  "step": 1760
1260
  },
1261
  {
1262
  "epoch": 2.131848284166165,
1263
+ "grad_norm": 11.610380172729492,
1264
  "learning_rate": 1.606425702811245e-05,
1265
+ "loss": 0.0279,
1266
  "step": 1770
1267
  },
1268
  {
1269
  "epoch": 2.1438892233594222,
1270
+ "grad_norm": 40.500938415527344,
1271
  "learning_rate": 1.5841142347166443e-05,
1272
+ "loss": 0.0024,
1273
  "step": 1780
1274
  },
1275
  {
1276
  "epoch": 2.155930162552679,
1277
+ "grad_norm": 10.448357582092285,
1278
  "learning_rate": 1.5618027666220438e-05,
1279
+ "loss": 0.0319,
1280
  "step": 1790
1281
  },
1282
  {
1283
  "epoch": 2.167971101745936,
1284
+ "grad_norm": 33.20566940307617,
1285
  "learning_rate": 1.5394912985274433e-05,
1286
+ "loss": 0.0138,
1287
  "step": 1800
1288
  },
1289
  {
1290
  "epoch": 2.1800120409391934,
1291
+ "grad_norm": 1.2241170406341553,
1292
  "learning_rate": 1.5171798304328425e-05,
1293
+ "loss": 0.0021,
1294
  "step": 1810
1295
  },
1296
  {
1297
  "epoch": 2.19205298013245,
1298
+ "grad_norm": 3.8104119300842285,
1299
  "learning_rate": 1.494868362338242e-05,
1300
+ "loss": 0.005,
1301
  "step": 1820
1302
  },
1303
  {
1304
  "epoch": 2.2040939193257074,
1305
+ "grad_norm": 0.266283243894577,
1306
  "learning_rate": 1.4725568942436414e-05,
1307
+ "loss": 0.0073,
1308
  "step": 1830
1309
  },
1310
  {
1311
  "epoch": 2.2161348585189646,
1312
+ "grad_norm": 17.923351287841797,
1313
  "learning_rate": 1.4502454261490405e-05,
1314
+ "loss": 0.0164,
1315
  "step": 1840
1316
  },
1317
  {
1318
  "epoch": 2.2281757977122214,
1319
+ "grad_norm": 34.860801696777344,
1320
  "learning_rate": 1.42793395805444e-05,
1321
+ "loss": 0.0217,
1322
  "step": 1850
1323
  },
1324
  {
1325
  "epoch": 2.2402167369054786,
1326
+ "grad_norm": 13.830132484436035,
1327
  "learning_rate": 1.4056224899598394e-05,
1328
+ "loss": 0.0116,
1329
  "step": 1860
1330
  },
1331
  {
1332
  "epoch": 2.252257676098736,
1333
+ "grad_norm": 30.40813446044922,
1334
  "learning_rate": 1.3833110218652387e-05,
1335
+ "loss": 0.0145,
1336
  "step": 1870
1337
  },
1338
  {
1339
  "epoch": 2.2642986152919926,
1340
+ "grad_norm": 5.166720867156982,
1341
  "learning_rate": 1.3609995537706383e-05,
1342
+ "loss": 0.0114,
1343
  "step": 1880
1344
  },
1345
  {
1346
  "epoch": 2.27633955448525,
1347
+ "grad_norm": 0.14178332686424255,
1348
  "learning_rate": 1.3386880856760376e-05,
1349
+ "loss": 0.006,
1350
  "step": 1890
1351
  },
1352
  {
1353
  "epoch": 2.288380493678507,
1354
+ "grad_norm": 14.360862731933594,
1355
  "learning_rate": 1.3163766175814368e-05,
1356
+ "loss": 0.0196,
1357
  "step": 1900
1358
  },
1359
  {
1360
  "epoch": 2.300421432871764,
1361
+ "grad_norm": 36.61174392700195,
1362
  "learning_rate": 1.2940651494868363e-05,
1363
+ "loss": 0.0112,
1364
  "step": 1910
1365
  },
1366
  {
1367
  "epoch": 2.312462372065021,
1368
+ "grad_norm": 0.05367890000343323,
1369
  "learning_rate": 1.2717536813922356e-05,
1370
+ "loss": 0.0044,
1371
  "step": 1920
1372
  },
1373
  {
1374
  "epoch": 2.3245033112582782,
1375
+ "grad_norm": 5.995903968811035,
1376
  "learning_rate": 1.2494422132976352e-05,
1377
+ "loss": 0.0068,
1378
  "step": 1930
1379
  },
1380
  {
1381
  "epoch": 2.336544250451535,
1382
+ "grad_norm": 0.1826033890247345,
1383
  "learning_rate": 1.2271307452030343e-05,
1384
+ "loss": 0.0037,
1385
  "step": 1940
1386
  },
1387
  {
1388
  "epoch": 2.3485851896447922,
1389
+ "grad_norm": 0.022175664082169533,
1390
  "learning_rate": 1.2048192771084338e-05,
1391
+ "loss": 0.0039,
1392
  "step": 1950
1393
  },
1394
  {
1395
  "epoch": 2.3606261288380495,
1396
+ "grad_norm": 0.009515208192169666,
1397
  "learning_rate": 1.1825078090138332e-05,
1398
+ "loss": 0.0174,
1399
  "step": 1960
1400
  },
1401
  {
1402
  "epoch": 2.3726670680313067,
1403
+ "grad_norm": 1.4253636598587036,
1404
  "learning_rate": 1.1601963409192325e-05,
1405
+ "loss": 0.0172,
1406
  "step": 1970
1407
  },
1408
  {
1409
  "epoch": 2.3847080072245634,
1410
+ "grad_norm": 0.5044592022895813,
1411
  "learning_rate": 1.1378848728246319e-05,
1412
+ "loss": 0.0136,
1413
  "step": 1980
1414
  },
1415
  {
1416
  "epoch": 2.3967489464178207,
1417
+ "grad_norm": 0.10618968307971954,
1418
  "learning_rate": 1.1155734047300314e-05,
1419
+ "loss": 0.0134,
1420
  "step": 1990
1421
  },
1422
  {
1423
  "epoch": 2.4087898856110774,
1424
+ "grad_norm": 0.37878698110580444,
1425
  "learning_rate": 1.0932619366354306e-05,
1426
+ "loss": 0.0077,
1427
  "step": 2000
1428
  },
1429
  {
1430
  "epoch": 2.4208308248043346,
1431
+ "grad_norm": 0.15668153762817383,
1432
  "learning_rate": 1.0709504685408301e-05,
1433
+ "loss": 0.0045,
1434
  "step": 2010
1435
  },
1436
  {
1437
  "epoch": 2.432871763997592,
1438
+ "grad_norm": 43.199405670166016,
1439
  "learning_rate": 1.0486390004462294e-05,
1440
+ "loss": 0.0107,
1441
  "step": 2020
1442
  },
1443
  {
1444
  "epoch": 2.444912703190849,
1445
+ "grad_norm": 25.30035972595215,
1446
  "learning_rate": 1.0263275323516288e-05,
1447
+ "loss": 0.0067,
1448
  "step": 2030
1449
  },
1450
  {
1451
  "epoch": 2.456953642384106,
1452
+ "grad_norm": 0.006626456510275602,
1453
  "learning_rate": 1.0040160642570281e-05,
1454
+ "loss": 0.0122,
1455
  "step": 2040
1456
  },
1457
  {
1458
  "epoch": 2.468994581577363,
1459
+ "grad_norm": 6.633700847625732,
1460
  "learning_rate": 9.817045961624276e-06,
1461
+ "loss": 0.0094,
1462
  "step": 2050
1463
  },
1464
  {
1465
  "epoch": 2.4810355207706203,
1466
+ "grad_norm": 0.006737573072314262,
1467
  "learning_rate": 9.593931280678268e-06,
1468
+ "loss": 0.0069,
1469
  "step": 2060
1470
  },
1471
  {
1472
  "epoch": 2.493076459963877,
1473
+ "grad_norm": 0.12591099739074707,
1474
  "learning_rate": 9.370816599732263e-06,
1475
+ "loss": 0.0134,
1476
  "step": 2070
1477
  },
1478
  {
1479
  "epoch": 2.5051173991571343,
1480
+ "grad_norm": 0.9642283916473389,
1481
  "learning_rate": 9.147701918786257e-06,
1482
+ "loss": 0.0047,
1483
  "step": 2080
1484
  },
1485
  {
1486
  "epoch": 2.5171583383503915,
1487
+ "grad_norm": 17.466915130615234,
1488
  "learning_rate": 8.92458723784025e-06,
1489
+ "loss": 0.0136,
1490
  "step": 2090
1491
  },
1492
  {
1493
  "epoch": 2.5291992775436483,
1494
+ "grad_norm": 0.034244559705257416,
1495
  "learning_rate": 8.701472556894244e-06,
1496
+ "loss": 0.0111,
1497
  "step": 2100
1498
  },
1499
  {
1500
  "epoch": 2.5412402167369055,
1501
+ "grad_norm": 0.06431080400943756,
1502
  "learning_rate": 8.478357875948239e-06,
1503
+ "loss": 0.0027,
1504
  "step": 2110
1505
  },
1506
  {
1507
  "epoch": 2.5532811559301627,
1508
+ "grad_norm": 0.09189895540475845,
1509
  "learning_rate": 8.25524319500223e-06,
1510
+ "loss": 0.0153,
1511
  "step": 2120
1512
  },
1513
  {
1514
  "epoch": 2.5653220951234195,
1515
+ "grad_norm": 0.2820568084716797,
1516
  "learning_rate": 8.032128514056226e-06,
1517
+ "loss": 0.0082,
1518
  "step": 2130
1519
  },
1520
  {
1521
  "epoch": 2.5773630343166767,
1522
+ "grad_norm": 0.28683242201805115,
1523
  "learning_rate": 7.809013833110219e-06,
1524
+ "loss": 0.0069,
1525
  "step": 2140
1526
  },
1527
  {
1528
  "epoch": 2.589403973509934,
1529
+ "grad_norm": 0.005995332263410091,
1530
  "learning_rate": 7.5858991521642126e-06,
1531
+ "loss": 0.003,
1532
  "step": 2150
1533
  },
1534
  {
1535
  "epoch": 2.601444912703191,
1536
+ "grad_norm": 0.008528484962880611,
1537
  "learning_rate": 7.362784471218207e-06,
1538
+ "loss": 0.009,
1539
  "step": 2160
1540
  },
1541
  {
1542
  "epoch": 2.613485851896448,
1543
+ "grad_norm": 0.0026729849632829428,
1544
  "learning_rate": 7.1396697902722e-06,
1545
+ "loss": 0.002,
1546
  "step": 2170
1547
  },
1548
  {
1549
  "epoch": 2.625526791089705,
1550
+ "grad_norm": 0.052899375557899475,
1551
  "learning_rate": 6.916555109326194e-06,
1552
+ "loss": 0.0129,
1553
  "step": 2180
1554
  },
1555
  {
1556
  "epoch": 2.637567730282962,
1557
+ "grad_norm": 4.501436710357666,
1558
  "learning_rate": 6.693440428380188e-06,
1559
+ "loss": 0.0124,
1560
  "step": 2190
1561
  },
1562
  {
1563
  "epoch": 2.649608669476219,
1564
+ "grad_norm": 7.804245948791504,
1565
  "learning_rate": 6.4703257474341815e-06,
1566
+ "loss": 0.0037,
1567
  "step": 2200
1568
  },
1569
  {
1570
  "epoch": 2.6616496086694763,
1571
+ "grad_norm": 0.03340630233287811,
1572
  "learning_rate": 6.247211066488176e-06,
1573
+ "loss": 0.0064,
1574
  "step": 2210
1575
  },
1576
  {
1577
  "epoch": 2.6736905478627335,
1578
+ "grad_norm": 0.016790101304650307,
1579
  "learning_rate": 6.024096385542169e-06,
1580
+ "loss": 0.0124,
1581
  "step": 2220
1582
  },
1583
  {
1584
  "epoch": 2.6857314870559903,
1585
+ "grad_norm": 0.05290292948484421,
1586
  "learning_rate": 5.800981704596163e-06,
1587
+ "loss": 0.0023,
1588
  "step": 2230
1589
  },
1590
  {
1591
  "epoch": 2.6977724262492475,
1592
+ "grad_norm": 7.013312816619873,
1593
  "learning_rate": 5.577867023650157e-06,
1594
+ "loss": 0.0093,
1595
  "step": 2240
1596
  },
1597
  {
1598
  "epoch": 2.7098133654425043,
1599
+ "grad_norm": 5.808313846588135,
1600
  "learning_rate": 5.3547523427041504e-06,
1601
+ "loss": 0.0025,
1602
  "step": 2250
1603
  },
1604
  {
1605
  "epoch": 2.7218543046357615,
1606
+ "grad_norm": 0.07213304936885834,
1607
  "learning_rate": 5.131637661758144e-06,
1608
+ "loss": 0.0036,
1609
  "step": 2260
1610
  },
1611
  {
1612
  "epoch": 2.7338952438290187,
1613
+ "grad_norm": 0.8108634352684021,
1614
  "learning_rate": 4.908522980812138e-06,
1615
+ "loss": 0.0019,
1616
  "step": 2270
1617
  },
1618
  {
1619
  "epoch": 2.745936183022276,
1620
+ "grad_norm": 0.007346575614064932,
1621
  "learning_rate": 4.685408299866132e-06,
1622
+ "loss": 0.0039,
1623
  "step": 2280
1624
  },
1625
  {
1626
  "epoch": 2.7579771222155327,
1627
+ "grad_norm": 12.846696853637695,
1628
  "learning_rate": 4.462293618920125e-06,
1629
+ "loss": 0.0203,
1630
  "step": 2290
1631
  },
1632
  {
1633
  "epoch": 2.77001806140879,
1634
+ "grad_norm": 1.8753130435943604,
1635
  "learning_rate": 4.239178937974119e-06,
1636
+ "loss": 0.0004,
1637
  "step": 2300
1638
  },
1639
  {
1640
  "epoch": 2.7820590006020467,
1641
+ "grad_norm": 0.018001612275838852,
1642
  "learning_rate": 4.016064257028113e-06,
1643
+ "loss": 0.0054,
1644
  "step": 2310
1645
  },
1646
  {
1647
  "epoch": 2.794099939795304,
1648
+ "grad_norm": 1.2618842124938965,
1649
  "learning_rate": 3.7929495760821063e-06,
1650
+ "loss": 0.0005,
1651
  "step": 2320
1652
  },
1653
  {
1654
  "epoch": 2.806140878988561,
1655
+ "grad_norm": 0.04195760190486908,
1656
  "learning_rate": 3.5698348951361e-06,
1657
+ "loss": 0.0164,
1658
  "step": 2330
1659
  },
1660
  {
1661
  "epoch": 2.8181818181818183,
1662
+ "grad_norm": 0.20935901999473572,
1663
  "learning_rate": 3.346720214190094e-06,
1664
+ "loss": 0.0121,
1665
  "step": 2340
1666
  },
1667
  {
1668
  "epoch": 2.830222757375075,
1669
+ "grad_norm": 0.14867332577705383,
1670
  "learning_rate": 3.123605533244088e-06,
1671
+ "loss": 0.0002,
1672
  "step": 2350
1673
  },
1674
  {
1675
  "epoch": 2.8422636965683323,
1676
+ "grad_norm": 12.636212348937988,
1677
  "learning_rate": 2.9004908522980813e-06,
1678
+ "loss": 0.0059,
1679
  "step": 2360
1680
  },
1681
  {
1682
  "epoch": 2.8543046357615895,
1683
+ "grad_norm": 0.08282257616519928,
1684
  "learning_rate": 2.6773761713520752e-06,
1685
+ "loss": 0.0072,
1686
  "step": 2370
1687
  },
1688
  {
1689
  "epoch": 2.8663455749548463,
1690
+ "grad_norm": 37.93838882446289,
1691
  "learning_rate": 2.454261490406069e-06,
1692
+ "loss": 0.0139,
1693
  "step": 2380
1694
  },
1695
  {
1696
  "epoch": 2.8783865141481035,
1697
+ "grad_norm": 0.531898558139801,
1698
  "learning_rate": 2.2311468094600625e-06,
1699
+ "loss": 0.0065,
1700
  "step": 2390
1701
  },
1702
  {
1703
  "epoch": 2.8904274533413608,
1704
+ "grad_norm": 17.711135864257812,
1705
  "learning_rate": 2.0080321285140564e-06,
1706
+ "loss": 0.007,
1707
  "step": 2400
1708
  },
1709
  {
1710
  "epoch": 2.902468392534618,
1711
+ "grad_norm": 0.04107944294810295,
1712
  "learning_rate": 1.78491744756805e-06,
1713
+ "loss": 0.0041,
1714
  "step": 2410
1715
  },
1716
  {
1717
  "epoch": 2.9145093317278747,
1718
+ "grad_norm": 0.00876621063798666,
1719
  "learning_rate": 1.561802766622044e-06,
1720
+ "loss": 0.0098,
1721
  "step": 2420
1722
  },
1723
  {
1724
  "epoch": 2.926550270921132,
1725
+ "grad_norm": 0.09590450674295425,
1726
  "learning_rate": 1.3386880856760376e-06,
1727
+ "loss": 0.0004,
1728
  "step": 2430
1729
  },
1730
  {
1731
  "epoch": 2.9385912101143887,
1732
+ "grad_norm": 0.335151344537735,
1733
  "learning_rate": 1.1155734047300313e-06,
1734
+ "loss": 0.002,
1735
  "step": 2440
1736
  },
1737
  {
1738
  "epoch": 2.950632149307646,
1739
+ "grad_norm": 17.89304542541504,
1740
  "learning_rate": 8.92458723784025e-07,
1741
+ "loss": 0.0038,
1742
  "step": 2450
1743
  },
1744
  {
1745
  "epoch": 2.962673088500903,
1746
+ "grad_norm": 7.468949794769287,
1747
  "learning_rate": 6.693440428380188e-07,
1748
+ "loss": 0.0054,
1749
  "step": 2460
1750
  },
1751
  {
1752
  "epoch": 2.9747140276941604,
1753
+ "grad_norm": 1.7089877128601074,
1754
  "learning_rate": 4.462293618920125e-07,
1755
+ "loss": 0.0044,
1756
  "step": 2470
1757
  },
1758
  {
1759
  "epoch": 2.986754966887417,
1760
+ "grad_norm": 0.03323278948664665,
1761
  "learning_rate": 2.2311468094600626e-07,
1762
+ "loss": 0.0097,
1763
  "step": 2480
1764
  },
1765
  {
1766
  "epoch": 2.9987959060806744,
1767
+ "grad_norm": 0.1638062298297882,
1768
  "learning_rate": 0.0,
1769
+ "loss": 0.0042,
1770
  "step": 2490
1771
  },
1772
  {
1773
  "epoch": 2.9987959060806744,
1774
+ "eval_f1": 0.9951696391786315,
1775
+ "eval_loss": 0.009932301938533783,
1776
+ "eval_runtime": 123.5306,
1777
+ "eval_samples_per_second": 95.604,
1778
+ "eval_steps_per_second": 2.995,
1779
  "step": 2490
1780
  },
1781
  {
1782
  "epoch": 2.9987959060806744,
1783
  "step": 2490,
1784
  "total_flos": 2.4970885598061527e+19,
1785
+ "train_loss": 0.05150821726141782,
1786
+ "train_runtime": 5585.3207,
1787
+ "train_samples_per_second": 57.088,
1788
+ "train_steps_per_second": 0.446
1789
  }
1790
  ],
1791
  "logging_steps": 10,