souging commited on
Commit
812c297
·
verified ·
1 Parent(s): 5a32254

Training in progress, step 300, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:142fddf7597ba8af566a65dd7556c5175782b026760e9e645b83aab257655746
3
  size 328603864
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:20bafa183fbf3bbe9c9905563495213e38cef9ee3c7cd345adcb253be884cd6f
3
  size 328603864
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f3f697b4b654fbc5577d476540fba027f5e83c6e274c7aa397d35a87934fe777
3
  size 657550198
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:42362df9ec696d3d0320f768a0bdb5b397ef5004355b755028fc2eecf05c932a
3
  size 657550198
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:51cd36ad2605ec4cab21e475c1ac7fb8b347fb6abdcfe5bc4f63aecf3d71d8ce
3
  size 16048
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d0eaf4305d2c5e42a4f55f7afbe09af37fe7e4788ba422b48e04415bf88585bc
3
  size 16048
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:261bbc763adac7d03c1a082867309667f8fb83fc7a1b84c5402b6347df42fe46
3
  size 16048
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fc8fb32fe9b2c01dd6e4f18169bd171991f967f5fa0ffc420d77cc0db6efb24a
3
  size 16048
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1c623db235bb71bffb776e0eab3b558456f134b1b73107cd8c23b1c40ced5689
3
  size 16048
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1a3f73d17ae338d8cac6481951019e365941b252022e7d0243a7fc8772314e59
3
  size 16048
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a2cd5bc26454d617fd60fe08010ba4a195c8e3a7f72f12cd3a767d4186b02c70
3
  size 16048
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d88dd21f7a8c6a81cd6b8f8e5c08a387d55c3de0e6893d149c96275b5808881
3
  size 16048
last-checkpoint/rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e469f052cb05e5cf49a235d076e0d255d7c8ab10ecd0688597c90d24aa91271f
3
  size 16048
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6bc88c834df7dbdf28112af82a0fbc77b3633a0bd734c746cbd963868cd1638d
3
  size 16048
last-checkpoint/rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4661e3c8087f3f1d2de392f52cf8afa125c865e290cc1f82744a9c62c75eb7b3
3
  size 16048
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f202a252593acda702f147cf7bd3a31174b510a25497f7cd9e72cd64c938ddb
3
  size 16048
last-checkpoint/rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:65e243bce56a5724d4707edf307955d94a50103507be1f90af964b7177780b77
3
  size 16048
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fa7fa98f002472e6436680d6e814a6a2fe757de9ae3aae9cbaba8f4114ee7b8c
3
  size 16048
last-checkpoint/rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:854fedc83edb427d64d1a4e41243a4a9a600619ca1c3230bf82f3fe10f0df54a
3
  size 16048
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1a239bf4d6b122a3f95861bfe8a309a94daf464bb026a78fa5c15e4524a467f9
3
  size 16048
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cfd6018b3bcc09be37863312cd9f5a1abeb208b2c8175343ea9ccbd7c6111ccd
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d07ac0fea5745c187bb33a34833c94c0ef719c462575fe061a42340c90be4024
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": 200,
3
  "best_metric": 0.1393619030714035,
4
  "best_model_checkpoint": "miner_id_24/checkpoint-200",
5
- "epoch": 4.66193853427896,
6
  "eval_steps": 100,
7
- "global_step": 200,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -1432,6 +1432,714 @@
1432
  "eval_samples_per_second": 148.12,
1433
  "eval_steps_per_second": 1.163,
1434
  "step": 200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1435
  }
1436
  ],
1437
  "logging_steps": 1,
@@ -1446,7 +2154,7 @@
1446
  "early_stopping_threshold": 0.0
1447
  },
1448
  "attributes": {
1449
- "early_stopping_patience_counter": 0
1450
  }
1451
  },
1452
  "TrainerControl": {
@@ -1455,12 +2163,12 @@
1455
  "should_evaluate": false,
1456
  "should_log": false,
1457
  "should_save": true,
1458
- "should_training_stop": false
1459
  },
1460
  "attributes": {}
1461
  }
1462
  },
1463
- "total_flos": 4.159623608500486e+18,
1464
  "train_batch_size": 3,
1465
  "trial_name": null,
1466
  "trial_params": null
 
2
  "best_global_step": 200,
3
  "best_metric": 0.1393619030714035,
4
  "best_model_checkpoint": "miner_id_24/checkpoint-200",
5
+ "epoch": 6.99290780141844,
6
  "eval_steps": 100,
7
+ "global_step": 300,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
1432
  "eval_samples_per_second": 148.12,
1433
  "eval_steps_per_second": 1.163,
1434
  "step": 200
1435
+ },
1436
+ {
1437
+ "epoch": 4.685579196217494,
1438
+ "grad_norm": 0.24888208508491516,
1439
+ "learning_rate": 0.00012243745415914883,
1440
+ "loss": 0.0975,
1441
+ "step": 201
1442
+ },
1443
+ {
1444
+ "epoch": 4.709219858156028,
1445
+ "grad_norm": 0.22495469450950623,
1446
+ "learning_rate": 0.00012113826236296244,
1447
+ "loss": 0.0935,
1448
+ "step": 202
1449
+ },
1450
+ {
1451
+ "epoch": 4.732860520094563,
1452
+ "grad_norm": 0.20877662301063538,
1453
+ "learning_rate": 0.0001198353248183118,
1454
+ "loss": 0.095,
1455
+ "step": 203
1456
+ },
1457
+ {
1458
+ "epoch": 4.756501182033097,
1459
+ "grad_norm": 0.25470611453056335,
1460
+ "learning_rate": 0.00011852887240871145,
1461
+ "loss": 0.0952,
1462
+ "step": 204
1463
+ },
1464
+ {
1465
+ "epoch": 4.780141843971631,
1466
+ "grad_norm": 0.21587012708187103,
1467
+ "learning_rate": 0.00011721913664051813,
1468
+ "loss": 0.0836,
1469
+ "step": 205
1470
+ },
1471
+ {
1472
+ "epoch": 4.803782505910165,
1473
+ "grad_norm": 0.22231656312942505,
1474
+ "learning_rate": 0.00011590634960190721,
1475
+ "loss": 0.0911,
1476
+ "step": 206
1477
+ },
1478
+ {
1479
+ "epoch": 4.8274231678487,
1480
+ "grad_norm": 0.2475675344467163,
1481
+ "learning_rate": 0.00011459074392174618,
1482
+ "loss": 0.0937,
1483
+ "step": 207
1484
+ },
1485
+ {
1486
+ "epoch": 4.851063829787234,
1487
+ "grad_norm": 0.19742602109909058,
1488
+ "learning_rate": 0.00011327255272837221,
1489
+ "loss": 0.0973,
1490
+ "step": 208
1491
+ },
1492
+ {
1493
+ "epoch": 4.874704491725768,
1494
+ "grad_norm": 0.18842868506908417,
1495
+ "learning_rate": 0.00011195200960828139,
1496
+ "loss": 0.0888,
1497
+ "step": 209
1498
+ },
1499
+ {
1500
+ "epoch": 4.898345153664303,
1501
+ "grad_norm": 0.1946844905614853,
1502
+ "learning_rate": 0.00011062934856473655,
1503
+ "loss": 0.0903,
1504
+ "step": 210
1505
+ },
1506
+ {
1507
+ "epoch": 4.921985815602837,
1508
+ "grad_norm": 0.2090204656124115,
1509
+ "learning_rate": 0.00010930480397630145,
1510
+ "loss": 0.1069,
1511
+ "step": 211
1512
+ },
1513
+ {
1514
+ "epoch": 4.945626477541371,
1515
+ "grad_norm": 0.21296799182891846,
1516
+ "learning_rate": 0.00010797861055530831,
1517
+ "loss": 0.0993,
1518
+ "step": 212
1519
+ },
1520
+ {
1521
+ "epoch": 4.969267139479905,
1522
+ "grad_norm": 0.22559182345867157,
1523
+ "learning_rate": 0.00010665100330626625,
1524
+ "loss": 0.0937,
1525
+ "step": 213
1526
+ },
1527
+ {
1528
+ "epoch": 4.99290780141844,
1529
+ "grad_norm": 0.18918611109256744,
1530
+ "learning_rate": 0.00010532221748421787,
1531
+ "loss": 0.0943,
1532
+ "step": 214
1533
+ },
1534
+ {
1535
+ "epoch": 5.0,
1536
+ "grad_norm": 0.40800580382347107,
1537
+ "learning_rate": 0.00010399248855305176,
1538
+ "loss": 0.1196,
1539
+ "step": 215
1540
+ },
1541
+ {
1542
+ "epoch": 5.0236406619385345,
1543
+ "grad_norm": 0.23491446673870087,
1544
+ "learning_rate": 0.00010266205214377748,
1545
+ "loss": 0.0763,
1546
+ "step": 216
1547
+ },
1548
+ {
1549
+ "epoch": 5.047281323877068,
1550
+ "grad_norm": 0.24946476519107819,
1551
+ "learning_rate": 0.00010133114401277139,
1552
+ "loss": 0.0805,
1553
+ "step": 217
1554
+ },
1555
+ {
1556
+ "epoch": 5.070921985815603,
1557
+ "grad_norm": 0.23227405548095703,
1558
+ "learning_rate": 0.0001,
1559
+ "loss": 0.0732,
1560
+ "step": 218
1561
+ },
1562
+ {
1563
+ "epoch": 5.094562647754137,
1564
+ "grad_norm": 0.24616649746894836,
1565
+ "learning_rate": 9.866885598722863e-05,
1566
+ "loss": 0.0867,
1567
+ "step": 219
1568
+ },
1569
+ {
1570
+ "epoch": 5.118203309692672,
1571
+ "grad_norm": 0.24532361328601837,
1572
+ "learning_rate": 9.733794785622253e-05,
1573
+ "loss": 0.0908,
1574
+ "step": 220
1575
+ },
1576
+ {
1577
+ "epoch": 5.141843971631205,
1578
+ "grad_norm": 0.19941219687461853,
1579
+ "learning_rate": 9.600751144694827e-05,
1580
+ "loss": 0.0799,
1581
+ "step": 221
1582
+ },
1583
+ {
1584
+ "epoch": 5.16548463356974,
1585
+ "grad_norm": 0.20473811030387878,
1586
+ "learning_rate": 9.467778251578217e-05,
1587
+ "loss": 0.0796,
1588
+ "step": 222
1589
+ },
1590
+ {
1591
+ "epoch": 5.1891252955082745,
1592
+ "grad_norm": 0.222214475274086,
1593
+ "learning_rate": 9.334899669373379e-05,
1594
+ "loss": 0.0785,
1595
+ "step": 223
1596
+ },
1597
+ {
1598
+ "epoch": 5.212765957446808,
1599
+ "grad_norm": 0.21746733784675598,
1600
+ "learning_rate": 9.202138944469168e-05,
1601
+ "loss": 0.0725,
1602
+ "step": 224
1603
+ },
1604
+ {
1605
+ "epoch": 5.236406619385343,
1606
+ "grad_norm": 0.203547403216362,
1607
+ "learning_rate": 9.069519602369856e-05,
1608
+ "loss": 0.0773,
1609
+ "step": 225
1610
+ },
1611
+ {
1612
+ "epoch": 5.260047281323877,
1613
+ "grad_norm": 0.24523097276687622,
1614
+ "learning_rate": 8.937065143526347e-05,
1615
+ "loss": 0.082,
1616
+ "step": 226
1617
+ },
1618
+ {
1619
+ "epoch": 5.283687943262412,
1620
+ "grad_norm": 0.23100948333740234,
1621
+ "learning_rate": 8.804799039171863e-05,
1622
+ "loss": 0.0759,
1623
+ "step": 227
1624
+ },
1625
+ {
1626
+ "epoch": 5.307328605200945,
1627
+ "grad_norm": 0.2774072289466858,
1628
+ "learning_rate": 8.672744727162781e-05,
1629
+ "loss": 0.0857,
1630
+ "step": 228
1631
+ },
1632
+ {
1633
+ "epoch": 5.33096926713948,
1634
+ "grad_norm": 0.24797679483890533,
1635
+ "learning_rate": 8.540925607825384e-05,
1636
+ "loss": 0.0766,
1637
+ "step": 229
1638
+ },
1639
+ {
1640
+ "epoch": 5.3546099290780145,
1641
+ "grad_norm": 0.20143181085586548,
1642
+ "learning_rate": 8.409365039809281e-05,
1643
+ "loss": 0.0828,
1644
+ "step": 230
1645
+ },
1646
+ {
1647
+ "epoch": 5.378250591016548,
1648
+ "grad_norm": 0.2065824419260025,
1649
+ "learning_rate": 8.27808633594819e-05,
1650
+ "loss": 0.0742,
1651
+ "step": 231
1652
+ },
1653
+ {
1654
+ "epoch": 5.401891252955083,
1655
+ "grad_norm": 0.22358693182468414,
1656
+ "learning_rate": 8.147112759128859e-05,
1657
+ "loss": 0.0706,
1658
+ "step": 232
1659
+ },
1660
+ {
1661
+ "epoch": 5.425531914893617,
1662
+ "grad_norm": 0.24426457285881042,
1663
+ "learning_rate": 8.016467518168821e-05,
1664
+ "loss": 0.0773,
1665
+ "step": 233
1666
+ },
1667
+ {
1668
+ "epoch": 5.449172576832151,
1669
+ "grad_norm": 0.18924954533576965,
1670
+ "learning_rate": 7.886173763703757e-05,
1671
+ "loss": 0.0752,
1672
+ "step": 234
1673
+ },
1674
+ {
1675
+ "epoch": 5.472813238770685,
1676
+ "grad_norm": 0.24037088453769684,
1677
+ "learning_rate": 7.756254584085121e-05,
1678
+ "loss": 0.084,
1679
+ "step": 235
1680
+ },
1681
+ {
1682
+ "epoch": 5.49645390070922,
1683
+ "grad_norm": 0.2293759435415268,
1684
+ "learning_rate": 7.626733001288851e-05,
1685
+ "loss": 0.0669,
1686
+ "step": 236
1687
+ },
1688
+ {
1689
+ "epoch": 5.520094562647754,
1690
+ "grad_norm": 0.1983073204755783,
1691
+ "learning_rate": 7.497631966835828e-05,
1692
+ "loss": 0.0823,
1693
+ "step": 237
1694
+ },
1695
+ {
1696
+ "epoch": 5.543735224586288,
1697
+ "grad_norm": 0.2341061383485794,
1698
+ "learning_rate": 7.368974357724789e-05,
1699
+ "loss": 0.0882,
1700
+ "step": 238
1701
+ },
1702
+ {
1703
+ "epoch": 5.567375886524823,
1704
+ "grad_norm": 0.1973034292459488,
1705
+ "learning_rate": 7.240782972378496e-05,
1706
+ "loss": 0.0671,
1707
+ "step": 239
1708
+ },
1709
+ {
1710
+ "epoch": 5.591016548463357,
1711
+ "grad_norm": 0.19070158898830414,
1712
+ "learning_rate": 7.113080526603792e-05,
1713
+ "loss": 0.0837,
1714
+ "step": 240
1715
+ },
1716
+ {
1717
+ "epoch": 5.614657210401891,
1718
+ "grad_norm": 0.2356303334236145,
1719
+ "learning_rate": 6.985889649566305e-05,
1720
+ "loss": 0.0933,
1721
+ "step": 241
1722
+ },
1723
+ {
1724
+ "epoch": 5.638297872340425,
1725
+ "grad_norm": 0.2121330201625824,
1726
+ "learning_rate": 6.859232879780515e-05,
1727
+ "loss": 0.0823,
1728
+ "step": 242
1729
+ },
1730
+ {
1731
+ "epoch": 5.66193853427896,
1732
+ "grad_norm": 0.20877498388290405,
1733
+ "learning_rate": 6.73313266111587e-05,
1734
+ "loss": 0.0899,
1735
+ "step": 243
1736
+ },
1737
+ {
1738
+ "epoch": 5.685579196217494,
1739
+ "grad_norm": 0.21572048962116241,
1740
+ "learning_rate": 6.607611338819697e-05,
1741
+ "loss": 0.0749,
1742
+ "step": 244
1743
+ },
1744
+ {
1745
+ "epoch": 5.709219858156028,
1746
+ "grad_norm": 0.19401253759860992,
1747
+ "learning_rate": 6.48269115555755e-05,
1748
+ "loss": 0.0718,
1749
+ "step": 245
1750
+ },
1751
+ {
1752
+ "epoch": 5.732860520094563,
1753
+ "grad_norm": 0.20852094888687134,
1754
+ "learning_rate": 6.358394247471778e-05,
1755
+ "loss": 0.0754,
1756
+ "step": 246
1757
+ },
1758
+ {
1759
+ "epoch": 5.756501182033097,
1760
+ "grad_norm": 0.2070273458957672,
1761
+ "learning_rate": 6.234742640258938e-05,
1762
+ "loss": 0.0733,
1763
+ "step": 247
1764
+ },
1765
+ {
1766
+ "epoch": 5.780141843971631,
1767
+ "grad_norm": 0.1823720633983612,
1768
+ "learning_rate": 6.111758245266794e-05,
1769
+ "loss": 0.0636,
1770
+ "step": 248
1771
+ },
1772
+ {
1773
+ "epoch": 5.803782505910165,
1774
+ "grad_norm": 0.2146531492471695,
1775
+ "learning_rate": 5.9894628556115854e-05,
1776
+ "loss": 0.0821,
1777
+ "step": 249
1778
+ },
1779
+ {
1780
+ "epoch": 5.8274231678487,
1781
+ "grad_norm": 0.20586134493350983,
1782
+ "learning_rate": 5.867878142316221e-05,
1783
+ "loss": 0.0861,
1784
+ "step": 250
1785
+ },
1786
+ {
1787
+ "epoch": 5.851063829787234,
1788
+ "grad_norm": 0.1832318753004074,
1789
+ "learning_rate": 5.7470256504701347e-05,
1790
+ "loss": 0.0694,
1791
+ "step": 251
1792
+ },
1793
+ {
1794
+ "epoch": 5.874704491725768,
1795
+ "grad_norm": 0.17847847938537598,
1796
+ "learning_rate": 5.626926795411447e-05,
1797
+ "loss": 0.0748,
1798
+ "step": 252
1799
+ },
1800
+ {
1801
+ "epoch": 5.898345153664303,
1802
+ "grad_norm": 0.19474737346172333,
1803
+ "learning_rate": 5.507602858932113e-05,
1804
+ "loss": 0.0754,
1805
+ "step": 253
1806
+ },
1807
+ {
1808
+ "epoch": 5.921985815602837,
1809
+ "grad_norm": 0.20228345692157745,
1810
+ "learning_rate": 5.38907498550674e-05,
1811
+ "loss": 0.0741,
1812
+ "step": 254
1813
+ },
1814
+ {
1815
+ "epoch": 5.945626477541371,
1816
+ "grad_norm": 0.19571395218372345,
1817
+ "learning_rate": 5.27136417854575e-05,
1818
+ "loss": 0.0808,
1819
+ "step": 255
1820
+ },
1821
+ {
1822
+ "epoch": 5.969267139479905,
1823
+ "grad_norm": 0.1964896023273468,
1824
+ "learning_rate": 5.1544912966734994e-05,
1825
+ "loss": 0.0722,
1826
+ "step": 256
1827
+ },
1828
+ {
1829
+ "epoch": 5.99290780141844,
1830
+ "grad_norm": 0.21053136885166168,
1831
+ "learning_rate": 5.0384770500321176e-05,
1832
+ "loss": 0.0748,
1833
+ "step": 257
1834
+ },
1835
+ {
1836
+ "epoch": 6.0,
1837
+ "grad_norm": 0.32032114267349243,
1838
+ "learning_rate": 4.9233419966116036e-05,
1839
+ "loss": 0.0792,
1840
+ "step": 258
1841
+ },
1842
+ {
1843
+ "epoch": 6.0236406619385345,
1844
+ "grad_norm": 0.18689100444316864,
1845
+ "learning_rate": 4.809106538606896e-05,
1846
+ "loss": 0.0672,
1847
+ "step": 259
1848
+ },
1849
+ {
1850
+ "epoch": 6.047281323877068,
1851
+ "grad_norm": 0.19790929555892944,
1852
+ "learning_rate": 4.695790918802576e-05,
1853
+ "loss": 0.0612,
1854
+ "step": 260
1855
+ },
1856
+ {
1857
+ "epoch": 6.070921985815603,
1858
+ "grad_norm": 0.17803865671157837,
1859
+ "learning_rate": 4.58341521698579e-05,
1860
+ "loss": 0.0567,
1861
+ "step": 261
1862
+ },
1863
+ {
1864
+ "epoch": 6.094562647754137,
1865
+ "grad_norm": 0.16323284804821014,
1866
+ "learning_rate": 4.47199934638807e-05,
1867
+ "loss": 0.0623,
1868
+ "step": 262
1869
+ },
1870
+ {
1871
+ "epoch": 6.118203309692672,
1872
+ "grad_norm": 0.183246910572052,
1873
+ "learning_rate": 4.3615630501566384e-05,
1874
+ "loss": 0.0727,
1875
+ "step": 263
1876
+ },
1877
+ {
1878
+ "epoch": 6.141843971631205,
1879
+ "grad_norm": 0.1922691911458969,
1880
+ "learning_rate": 4.252125897855932e-05,
1881
+ "loss": 0.0729,
1882
+ "step": 264
1883
+ },
1884
+ {
1885
+ "epoch": 6.16548463356974,
1886
+ "grad_norm": 0.18657496571540833,
1887
+ "learning_rate": 4.143707281999767e-05,
1888
+ "loss": 0.0601,
1889
+ "step": 265
1890
+ },
1891
+ {
1892
+ "epoch": 6.1891252955082745,
1893
+ "grad_norm": 0.1704358607530594,
1894
+ "learning_rate": 4.036326414614985e-05,
1895
+ "loss": 0.0677,
1896
+ "step": 266
1897
+ },
1898
+ {
1899
+ "epoch": 6.212765957446808,
1900
+ "grad_norm": 0.1788199245929718,
1901
+ "learning_rate": 3.930002323837025e-05,
1902
+ "loss": 0.0605,
1903
+ "step": 267
1904
+ },
1905
+ {
1906
+ "epoch": 6.236406619385343,
1907
+ "grad_norm": 0.1892111450433731,
1908
+ "learning_rate": 3.824753850538082e-05,
1909
+ "loss": 0.0621,
1910
+ "step": 268
1911
+ },
1912
+ {
1913
+ "epoch": 6.260047281323877,
1914
+ "grad_norm": 0.1900961846113205,
1915
+ "learning_rate": 3.720599644988482e-05,
1916
+ "loss": 0.0727,
1917
+ "step": 269
1918
+ },
1919
+ {
1920
+ "epoch": 6.283687943262412,
1921
+ "grad_norm": 0.25505387783050537,
1922
+ "learning_rate": 3.617558163551802e-05,
1923
+ "loss": 0.0639,
1924
+ "step": 270
1925
+ },
1926
+ {
1927
+ "epoch": 6.307328605200945,
1928
+ "grad_norm": 0.17928794026374817,
1929
+ "learning_rate": 3.5156476654143497e-05,
1930
+ "loss": 0.0595,
1931
+ "step": 271
1932
+ },
1933
+ {
1934
+ "epoch": 6.33096926713948,
1935
+ "grad_norm": 0.17975100874900818,
1936
+ "learning_rate": 3.414886209349615e-05,
1937
+ "loss": 0.0697,
1938
+ "step": 272
1939
+ },
1940
+ {
1941
+ "epoch": 6.3546099290780145,
1942
+ "grad_norm": 0.16846145689487457,
1943
+ "learning_rate": 3.315291650518197e-05,
1944
+ "loss": 0.0593,
1945
+ "step": 273
1946
+ },
1947
+ {
1948
+ "epoch": 6.378250591016548,
1949
+ "grad_norm": 0.15943646430969238,
1950
+ "learning_rate": 3.216881637303839e-05,
1951
+ "loss": 0.0597,
1952
+ "step": 274
1953
+ },
1954
+ {
1955
+ "epoch": 6.401891252955083,
1956
+ "grad_norm": 0.16623468697071075,
1957
+ "learning_rate": 3.119673608186085e-05,
1958
+ "loss": 0.0595,
1959
+ "step": 275
1960
+ },
1961
+ {
1962
+ "epoch": 6.425531914893617,
1963
+ "grad_norm": 0.17790904641151428,
1964
+ "learning_rate": 3.0236847886501542e-05,
1965
+ "loss": 0.0604,
1966
+ "step": 276
1967
+ },
1968
+ {
1969
+ "epoch": 6.449172576832151,
1970
+ "grad_norm": 0.18511582911014557,
1971
+ "learning_rate": 2.9289321881345254e-05,
1972
+ "loss": 0.0678,
1973
+ "step": 277
1974
+ },
1975
+ {
1976
+ "epoch": 6.472813238770685,
1977
+ "grad_norm": 0.17497338354587555,
1978
+ "learning_rate": 2.8354325970168484e-05,
1979
+ "loss": 0.0568,
1980
+ "step": 278
1981
+ },
1982
+ {
1983
+ "epoch": 6.49645390070922,
1984
+ "grad_norm": 0.1610943078994751,
1985
+ "learning_rate": 2.743202583638641e-05,
1986
+ "loss": 0.068,
1987
+ "step": 279
1988
+ },
1989
+ {
1990
+ "epoch": 6.520094562647754,
1991
+ "grad_norm": 0.1880873739719391,
1992
+ "learning_rate": 2.6522584913693294e-05,
1993
+ "loss": 0.06,
1994
+ "step": 280
1995
+ },
1996
+ {
1997
+ "epoch": 6.543735224586288,
1998
+ "grad_norm": 0.17921674251556396,
1999
+ "learning_rate": 2.5626164357101857e-05,
2000
+ "loss": 0.0593,
2001
+ "step": 281
2002
+ },
2003
+ {
2004
+ "epoch": 6.567375886524823,
2005
+ "grad_norm": 0.17583315074443817,
2006
+ "learning_rate": 2.4742923014386156e-05,
2007
+ "loss": 0.0664,
2008
+ "step": 282
2009
+ },
2010
+ {
2011
+ "epoch": 6.591016548463357,
2012
+ "grad_norm": 0.19071973860263824,
2013
+ "learning_rate": 2.3873017397933327e-05,
2014
+ "loss": 0.0644,
2015
+ "step": 283
2016
+ },
2017
+ {
2018
+ "epoch": 6.614657210401891,
2019
+ "grad_norm": 0.1757621169090271,
2020
+ "learning_rate": 2.301660165700936e-05,
2021
+ "loss": 0.0612,
2022
+ "step": 284
2023
+ },
2024
+ {
2025
+ "epoch": 6.638297872340425,
2026
+ "grad_norm": 0.15712977945804596,
2027
+ "learning_rate": 2.2173827550443417e-05,
2028
+ "loss": 0.0663,
2029
+ "step": 285
2030
+ },
2031
+ {
2032
+ "epoch": 6.66193853427896,
2033
+ "grad_norm": 0.16134823858737946,
2034
+ "learning_rate": 2.1344844419735755e-05,
2035
+ "loss": 0.0551,
2036
+ "step": 286
2037
+ },
2038
+ {
2039
+ "epoch": 6.685579196217494,
2040
+ "grad_norm": 0.168061301112175,
2041
+ "learning_rate": 2.0529799162594244e-05,
2042
+ "loss": 0.06,
2043
+ "step": 287
2044
+ },
2045
+ {
2046
+ "epoch": 6.709219858156028,
2047
+ "grad_norm": 0.1770693063735962,
2048
+ "learning_rate": 1.9728836206903656e-05,
2049
+ "loss": 0.0664,
2050
+ "step": 288
2051
+ },
2052
+ {
2053
+ "epoch": 6.732860520094563,
2054
+ "grad_norm": 0.18103648722171783,
2055
+ "learning_rate": 1.8942097485132626e-05,
2056
+ "loss": 0.062,
2057
+ "step": 289
2058
+ },
2059
+ {
2060
+ "epoch": 6.756501182033097,
2061
+ "grad_norm": 0.18184252083301544,
2062
+ "learning_rate": 1.8169722409183097e-05,
2063
+ "loss": 0.059,
2064
+ "step": 290
2065
+ },
2066
+ {
2067
+ "epoch": 6.780141843971631,
2068
+ "grad_norm": 0.1702430248260498,
2069
+ "learning_rate": 1.741184784568608e-05,
2070
+ "loss": 0.062,
2071
+ "step": 291
2072
+ },
2073
+ {
2074
+ "epoch": 6.803782505910165,
2075
+ "grad_norm": 0.16067641973495483,
2076
+ "learning_rate": 1.6668608091748495e-05,
2077
+ "loss": 0.0574,
2078
+ "step": 292
2079
+ },
2080
+ {
2081
+ "epoch": 6.8274231678487,
2082
+ "grad_norm": 0.1779567003250122,
2083
+ "learning_rate": 1.5940134851155697e-05,
2084
+ "loss": 0.0593,
2085
+ "step": 293
2086
+ },
2087
+ {
2088
+ "epoch": 6.851063829787234,
2089
+ "grad_norm": 0.17295385897159576,
2090
+ "learning_rate": 1.522655721103291e-05,
2091
+ "loss": 0.0695,
2092
+ "step": 294
2093
+ },
2094
+ {
2095
+ "epoch": 6.874704491725768,
2096
+ "grad_norm": 0.1924130916595459,
2097
+ "learning_rate": 1.4528001618970966e-05,
2098
+ "loss": 0.0719,
2099
+ "step": 295
2100
+ },
2101
+ {
2102
+ "epoch": 6.898345153664303,
2103
+ "grad_norm": 0.17258019745349884,
2104
+ "learning_rate": 1.3844591860619383e-05,
2105
+ "loss": 0.0646,
2106
+ "step": 296
2107
+ },
2108
+ {
2109
+ "epoch": 6.921985815602837,
2110
+ "grad_norm": 0.17023594677448273,
2111
+ "learning_rate": 1.3176449037751293e-05,
2112
+ "loss": 0.0608,
2113
+ "step": 297
2114
+ },
2115
+ {
2116
+ "epoch": 6.945626477541371,
2117
+ "grad_norm": 0.1798073947429657,
2118
+ "learning_rate": 1.2523691546803873e-05,
2119
+ "loss": 0.0774,
2120
+ "step": 298
2121
+ },
2122
+ {
2123
+ "epoch": 6.969267139479905,
2124
+ "grad_norm": 0.1567268669605255,
2125
+ "learning_rate": 1.1886435057898337e-05,
2126
+ "loss": 0.0809,
2127
+ "step": 299
2128
+ },
2129
+ {
2130
+ "epoch": 6.99290780141844,
2131
+ "grad_norm": 0.1746884137392044,
2132
+ "learning_rate": 1.1264792494342857e-05,
2133
+ "loss": 0.0597,
2134
+ "step": 300
2135
+ },
2136
+ {
2137
+ "epoch": 6.99290780141844,
2138
+ "eval_loss": 0.13989537954330444,
2139
+ "eval_runtime": 26.7265,
2140
+ "eval_samples_per_second": 147.756,
2141
+ "eval_steps_per_second": 1.16,
2142
+ "step": 300
2143
  }
2144
  ],
2145
  "logging_steps": 1,
 
2154
  "early_stopping_threshold": 0.0
2155
  },
2156
  "attributes": {
2157
+ "early_stopping_patience_counter": 1
2158
  }
2159
  },
2160
  "TrainerControl": {
 
2163
  "should_evaluate": false,
2164
  "should_log": false,
2165
  "should_save": true,
2166
+ "should_training_stop": true
2167
  },
2168
  "attributes": {}
2169
  }
2170
  },
2171
+ "total_flos": 6.239523305766781e+18,
2172
  "train_batch_size": 3,
2173
  "trial_name": null,
2174
  "trial_params": null