mtzig commited on
Commit
227aedf
·
verified ·
1 Parent(s): 16443fe

Training in progress, step 300, checkpoint

Browse files
last-checkpoint/optimizer_0/.metadata CHANGED
Binary files a/last-checkpoint/optimizer_0/.metadata and b/last-checkpoint/optimizer_0/.metadata differ
 
last-checkpoint/optimizer_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c0ba6fb33cfa34a3a19c1c859523ba5b8ee34e4ce14cd7ee85604eeb2a478122
3
  size 13934748
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3720c9d7c3d147e3c6a4c31b748a401804a80a6f28683e7c9983f2e8c8a0f20a
3
  size 13934748
last-checkpoint/optimizer_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:af5a9ded1ac1ad15369c22168c9bdc24120369b807b0236304fc238cd01770cd
3
  size 13999412
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3638a66f29f136c13174bee0dd43e693f5fc102e10bee4ca9b5d7060756ced7c
3
  size 13999412
last-checkpoint/optimizer_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:aac30f6108157475f0f5a525a5d713d9caa83e68412d6e9feee34fa1c788d678
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:59454dac3aeba9c46e0b8ed50eb871c1d98271ecbabd9754c8618cdc65584ad9
3
  size 13990904
last-checkpoint/optimizer_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:39a8a076cc77e594caeb9e94a5de64b7a427c01ec5dd10b1dbe76fa77717e2cd
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:93ca40f1a301b546ff3c8e51d511eb49571367df816e5ac6c43d69b073ba1e35
3
  size 13990904
last-checkpoint/pytorch_model_fsdp_0/.metadata CHANGED
Binary files a/last-checkpoint/pytorch_model_fsdp_0/.metadata and b/last-checkpoint/pytorch_model_fsdp_0/.metadata differ
 
last-checkpoint/pytorch_model_fsdp_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:28d745959ce06db61825f40fb63ea63b9e62f268815aaab31b17f3705247564b
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e305eb64bbd004a3712d4d0f3b65560d9c0d8b3920c2789be35be33fef333cd
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1c5600d73cdd63622761d1592bb37fa01c39c9bcc957af85fe4bd2e4cd01fabc
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16b6721393030c99c98218e1bcc44fa93cc347e7c920295cebba18bf69ebf311
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:be56560930f7a89d468b530340537eec5b918174080267a8cc6186d4978acf89
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f9c2af98820448e537c2aa09618f8c2299b2ed8c9040abdad7cc23d455398738
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:851b0a400aa71d9fc3d83e0e2570f4bbeaf98efc2e51c1f18c4d64aa51f39304
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fa8dfa614952af057b305d24646b204cdfa9b2fbf5610aa112de72f4d1903dd4
3
  size 6966784
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:805afa176b455b67a891f7c63c255879dd3a372d6c9fa2140f3c0a2149d52710
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6185843c50764de20922699c89193c33e1e13037719a5d55479aa190e715e4fc
3
  size 14960
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:656385b8033d1cc9de4c8239cf888e2d83a5db8f95016de71e971858eab1c195
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:35e51ecf57078c2d652964726d8abc8157e10e9fdddf8cacb5700305b465147a
3
  size 14960
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8a4775b283f1cbab74e1bfc47bfbe045632e0a9c46d8f354762f3216e862bf61
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ebe10fe55b3a58ae13fa7a58fca8f2486fa82c4aa360522ee9cde43cc43ba473
3
  size 14960
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3ecbc04b6bcc44f7032a40edb9b3c06e3acf5ba0f1fb508b9a44802995aad5b9
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fcda73faaa8d5a9ab0a72d2fef1c1af0341c8e7f8ec0eede744acae39dd22f43
3
  size 14960
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:88d5a351fddcb4718730dd82c69354176cd179de4c82fa6d41e0282fb5e2ab11
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a3e47edb1b664bc04c493b0996774157c1ffdb9f0b12df515a0b32829d748704
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.5747126436781609,
5
  "eval_steps": 20,
6
- "global_step": 200,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -1539,6 +1539,766 @@
1539
  "eval_samples_per_second": 6.49,
1540
  "eval_steps_per_second": 0.245,
1541
  "step": 200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1542
  }
1543
  ],
1544
  "logging_steps": 1,
@@ -1558,7 +2318,7 @@
1558
  "attributes": {}
1559
  }
1560
  },
1561
- "total_flos": 6.331748017661542e+16,
1562
  "train_batch_size": 8,
1563
  "trial_name": null,
1564
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.8620689655172413,
5
  "eval_steps": 20,
6
+ "global_step": 300,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
1539
  "eval_samples_per_second": 6.49,
1540
  "eval_steps_per_second": 0.245,
1541
  "step": 200
1542
+ },
1543
+ {
1544
+ "epoch": 0.5775862068965517,
1545
+ "grad_norm": 3.761861801147461,
1546
+ "learning_rate": 9.047925700872552e-06,
1547
+ "loss": 0.3089,
1548
+ "step": 201
1549
+ },
1550
+ {
1551
+ "epoch": 0.5804597701149425,
1552
+ "grad_norm": 2.631603240966797,
1553
+ "learning_rate": 8.948060899634846e-06,
1554
+ "loss": 0.2943,
1555
+ "step": 202
1556
+ },
1557
+ {
1558
+ "epoch": 0.5833333333333334,
1559
+ "grad_norm": 2.613485336303711,
1560
+ "learning_rate": 8.848302072078762e-06,
1561
+ "loss": 0.2866,
1562
+ "step": 203
1563
+ },
1564
+ {
1565
+ "epoch": 0.5862068965517241,
1566
+ "grad_norm": 2.1991889476776123,
1567
+ "learning_rate": 8.748659268035339e-06,
1568
+ "loss": 0.2749,
1569
+ "step": 204
1570
+ },
1571
+ {
1572
+ "epoch": 0.5890804597701149,
1573
+ "grad_norm": 2.7884528636932373,
1574
+ "learning_rate": 8.649142525647271e-06,
1575
+ "loss": 0.3378,
1576
+ "step": 205
1577
+ },
1578
+ {
1579
+ "epoch": 0.5919540229885057,
1580
+ "grad_norm": 4.045213222503662,
1581
+ "learning_rate": 8.549761870357633e-06,
1582
+ "loss": 0.3398,
1583
+ "step": 206
1584
+ },
1585
+ {
1586
+ "epoch": 0.5948275862068966,
1587
+ "grad_norm": 2.83146595954895,
1588
+ "learning_rate": 8.450527313899923e-06,
1589
+ "loss": 0.2951,
1590
+ "step": 207
1591
+ },
1592
+ {
1593
+ "epoch": 0.5977011494252874,
1594
+ "grad_norm": 2.664379358291626,
1595
+ "learning_rate": 8.351448853289448e-06,
1596
+ "loss": 0.3452,
1597
+ "step": 208
1598
+ },
1599
+ {
1600
+ "epoch": 0.6005747126436781,
1601
+ "grad_norm": 2.2802395820617676,
1602
+ "learning_rate": 8.25253646981622e-06,
1603
+ "loss": 0.2993,
1604
+ "step": 209
1605
+ },
1606
+ {
1607
+ "epoch": 0.603448275862069,
1608
+ "grad_norm": 2.1689095497131348,
1609
+ "learning_rate": 8.153800128039441e-06,
1610
+ "loss": 0.2847,
1611
+ "step": 210
1612
+ },
1613
+ {
1614
+ "epoch": 0.6063218390804598,
1615
+ "grad_norm": 3.4026296138763428,
1616
+ "learning_rate": 8.05524977478364e-06,
1617
+ "loss": 0.3192,
1618
+ "step": 211
1619
+ },
1620
+ {
1621
+ "epoch": 0.6091954022988506,
1622
+ "grad_norm": 3.272836923599243,
1623
+ "learning_rate": 7.956895338136618e-06,
1624
+ "loss": 0.3129,
1625
+ "step": 212
1626
+ },
1627
+ {
1628
+ "epoch": 0.6120689655172413,
1629
+ "grad_norm": 4.036294460296631,
1630
+ "learning_rate": 7.858746726449309e-06,
1631
+ "loss": 0.3199,
1632
+ "step": 213
1633
+ },
1634
+ {
1635
+ "epoch": 0.6149425287356322,
1636
+ "grad_norm": 2.3944919109344482,
1637
+ "learning_rate": 7.760813827337555e-06,
1638
+ "loss": 0.2513,
1639
+ "step": 214
1640
+ },
1641
+ {
1642
+ "epoch": 0.617816091954023,
1643
+ "grad_norm": 4.011943340301514,
1644
+ "learning_rate": 7.663106506686057e-06,
1645
+ "loss": 0.3026,
1646
+ "step": 215
1647
+ },
1648
+ {
1649
+ "epoch": 0.6206896551724138,
1650
+ "grad_norm": 2.424299716949463,
1651
+ "learning_rate": 7.565634607654453e-06,
1652
+ "loss": 0.3029,
1653
+ "step": 216
1654
+ },
1655
+ {
1656
+ "epoch": 0.6235632183908046,
1657
+ "grad_norm": 5.497297286987305,
1658
+ "learning_rate": 7.468407949685695e-06,
1659
+ "loss": 0.3601,
1660
+ "step": 217
1661
+ },
1662
+ {
1663
+ "epoch": 0.6264367816091954,
1664
+ "grad_norm": 2.9515810012817383,
1665
+ "learning_rate": 7.371436327516854e-06,
1666
+ "loss": 0.328,
1667
+ "step": 218
1668
+ },
1669
+ {
1670
+ "epoch": 0.6293103448275862,
1671
+ "grad_norm": 3.359109878540039,
1672
+ "learning_rate": 7.274729510192367e-06,
1673
+ "loss": 0.3104,
1674
+ "step": 219
1675
+ },
1676
+ {
1677
+ "epoch": 0.632183908045977,
1678
+ "grad_norm": 3.1870110034942627,
1679
+ "learning_rate": 7.1782972400798825e-06,
1680
+ "loss": 0.34,
1681
+ "step": 220
1682
+ },
1683
+ {
1684
+ "epoch": 0.632183908045977,
1685
+ "eval_accuracy": 0.8688118811881188,
1686
+ "eval_f1": 0.7464114832535885,
1687
+ "eval_loss": 0.31775230169296265,
1688
+ "eval_precision": 0.7572815533980582,
1689
+ "eval_recall": 0.7358490566037735,
1690
+ "eval_runtime": 17.2507,
1691
+ "eval_samples_per_second": 6.145,
1692
+ "eval_steps_per_second": 0.232,
1693
+ "step": 220
1694
+ },
1695
+ {
1696
+ "epoch": 0.6350574712643678,
1697
+ "grad_norm": 2.9183595180511475,
1698
+ "learning_rate": 7.082149231888833e-06,
1699
+ "loss": 0.2827,
1700
+ "step": 221
1701
+ },
1702
+ {
1703
+ "epoch": 0.6379310344827587,
1704
+ "grad_norm": 4.59254789352417,
1705
+ "learning_rate": 6.986295171691727e-06,
1706
+ "loss": 0.3284,
1707
+ "step": 222
1708
+ },
1709
+ {
1710
+ "epoch": 0.6408045977011494,
1711
+ "grad_norm": 2.9099929332733154,
1712
+ "learning_rate": 6.890744715948388e-06,
1713
+ "loss": 0.2972,
1714
+ "step": 223
1715
+ },
1716
+ {
1717
+ "epoch": 0.6436781609195402,
1718
+ "grad_norm": 2.407136917114258,
1719
+ "learning_rate": 6.795507490533142e-06,
1720
+ "loss": 0.2973,
1721
+ "step": 224
1722
+ },
1723
+ {
1724
+ "epoch": 0.646551724137931,
1725
+ "grad_norm": 2.786597728729248,
1726
+ "learning_rate": 6.700593089765086e-06,
1727
+ "loss": 0.3426,
1728
+ "step": 225
1729
+ },
1730
+ {
1731
+ "epoch": 0.6494252873563219,
1732
+ "grad_norm": 2.642282485961914,
1733
+ "learning_rate": 6.606011075441556e-06,
1734
+ "loss": 0.3201,
1735
+ "step": 226
1736
+ },
1737
+ {
1738
+ "epoch": 0.6522988505747126,
1739
+ "grad_norm": 2.1382575035095215,
1740
+ "learning_rate": 6.511770975874862e-06,
1741
+ "loss": 0.2767,
1742
+ "step": 227
1743
+ },
1744
+ {
1745
+ "epoch": 0.6551724137931034,
1746
+ "grad_norm": 4.03010892868042,
1747
+ "learning_rate": 6.417882284932373e-06,
1748
+ "loss": 0.2742,
1749
+ "step": 228
1750
+ },
1751
+ {
1752
+ "epoch": 0.6580459770114943,
1753
+ "grad_norm": 2.754526138305664,
1754
+ "learning_rate": 6.324354461080121e-06,
1755
+ "loss": 0.2672,
1756
+ "step": 229
1757
+ },
1758
+ {
1759
+ "epoch": 0.6609195402298851,
1760
+ "grad_norm": 2.4932737350463867,
1761
+ "learning_rate": 6.231196926429913e-06,
1762
+ "loss": 0.2835,
1763
+ "step": 230
1764
+ },
1765
+ {
1766
+ "epoch": 0.6637931034482759,
1767
+ "grad_norm": 4.364743232727051,
1768
+ "learning_rate": 6.138419065790169e-06,
1769
+ "loss": 0.3079,
1770
+ "step": 231
1771
+ },
1772
+ {
1773
+ "epoch": 0.6666666666666666,
1774
+ "grad_norm": 2.9517085552215576,
1775
+ "learning_rate": 6.046030225720456e-06,
1776
+ "loss": 0.2422,
1777
+ "step": 232
1778
+ },
1779
+ {
1780
+ "epoch": 0.6695402298850575,
1781
+ "grad_norm": 2.8469698429107666,
1782
+ "learning_rate": 5.95403971358991e-06,
1783
+ "loss": 0.2641,
1784
+ "step": 233
1785
+ },
1786
+ {
1787
+ "epoch": 0.6724137931034483,
1788
+ "grad_norm": 3.698885440826416,
1789
+ "learning_rate": 5.86245679663962e-06,
1790
+ "loss": 0.3572,
1791
+ "step": 234
1792
+ },
1793
+ {
1794
+ "epoch": 0.6752873563218391,
1795
+ "grad_norm": 2.205653667449951,
1796
+ "learning_rate": 5.7712907010490036e-06,
1797
+ "loss": 0.252,
1798
+ "step": 235
1799
+ },
1800
+ {
1801
+ "epoch": 0.6781609195402298,
1802
+ "grad_norm": 3.1082050800323486,
1803
+ "learning_rate": 5.680550611006372e-06,
1804
+ "loss": 0.3062,
1805
+ "step": 236
1806
+ },
1807
+ {
1808
+ "epoch": 0.6810344827586207,
1809
+ "grad_norm": 4.51684045791626,
1810
+ "learning_rate": 5.590245667783701e-06,
1811
+ "loss": 0.281,
1812
+ "step": 237
1813
+ },
1814
+ {
1815
+ "epoch": 0.6839080459770115,
1816
+ "grad_norm": 2.53916335105896,
1817
+ "learning_rate": 5.5003849688157075e-06,
1818
+ "loss": 0.3273,
1819
+ "step": 238
1820
+ },
1821
+ {
1822
+ "epoch": 0.6867816091954023,
1823
+ "grad_norm": 3.1908535957336426,
1824
+ "learning_rate": 5.4109775667833866e-06,
1825
+ "loss": 0.3091,
1826
+ "step": 239
1827
+ },
1828
+ {
1829
+ "epoch": 0.6896551724137931,
1830
+ "grad_norm": 2.92702579498291,
1831
+ "learning_rate": 5.322032468702037e-06,
1832
+ "loss": 0.2969,
1833
+ "step": 240
1834
+ },
1835
+ {
1836
+ "epoch": 0.6896551724137931,
1837
+ "eval_accuracy": 0.8564356435643564,
1838
+ "eval_f1": 0.7289719626168224,
1839
+ "eval_loss": 0.3177642822265625,
1840
+ "eval_precision": 0.7222222222222222,
1841
+ "eval_recall": 0.7358490566037735,
1842
+ "eval_runtime": 16.3833,
1843
+ "eval_samples_per_second": 6.47,
1844
+ "eval_steps_per_second": 0.244,
1845
+ "step": 240
1846
+ },
1847
+ {
1848
+ "epoch": 0.6925287356321839,
1849
+ "grad_norm": 2.631377696990967,
1850
+ "learning_rate": 5.233558635013842e-06,
1851
+ "loss": 0.3108,
1852
+ "step": 241
1853
+ },
1854
+ {
1855
+ "epoch": 0.6954022988505747,
1856
+ "grad_norm": 2.5703439712524414,
1857
+ "learning_rate": 5.145564978685234e-06,
1858
+ "loss": 0.2965,
1859
+ "step": 242
1860
+ },
1861
+ {
1862
+ "epoch": 0.6982758620689655,
1863
+ "grad_norm": 5.4829254150390625,
1864
+ "learning_rate": 5.058060364308965e-06,
1865
+ "loss": 0.3302,
1866
+ "step": 243
1867
+ },
1868
+ {
1869
+ "epoch": 0.7011494252873564,
1870
+ "grad_norm": 4.077176570892334,
1871
+ "learning_rate": 4.971053607211069e-06,
1872
+ "loss": 0.3833,
1873
+ "step": 244
1874
+ },
1875
+ {
1876
+ "epoch": 0.7040229885057471,
1877
+ "grad_norm": 3.5669100284576416,
1878
+ "learning_rate": 4.884553472562809e-06,
1879
+ "loss": 0.3203,
1880
+ "step": 245
1881
+ },
1882
+ {
1883
+ "epoch": 0.7068965517241379,
1884
+ "grad_norm": 2.850348472595215,
1885
+ "learning_rate": 4.7985686744976714e-06,
1886
+ "loss": 0.2846,
1887
+ "step": 246
1888
+ },
1889
+ {
1890
+ "epoch": 0.7097701149425287,
1891
+ "grad_norm": 3.9147164821624756,
1892
+ "learning_rate": 4.713107875233459e-06,
1893
+ "loss": 0.3315,
1894
+ "step": 247
1895
+ },
1896
+ {
1897
+ "epoch": 0.7126436781609196,
1898
+ "grad_norm": 3.5606236457824707,
1899
+ "learning_rate": 4.628179684199685e-06,
1900
+ "loss": 0.3101,
1901
+ "step": 248
1902
+ },
1903
+ {
1904
+ "epoch": 0.7155172413793104,
1905
+ "grad_norm": 2.9054181575775146,
1906
+ "learning_rate": 4.543792657170228e-06,
1907
+ "loss": 0.3086,
1908
+ "step": 249
1909
+ },
1910
+ {
1911
+ "epoch": 0.7183908045977011,
1912
+ "grad_norm": 2.9038889408111572,
1913
+ "learning_rate": 4.459955295401415e-06,
1914
+ "loss": 0.2979,
1915
+ "step": 250
1916
+ },
1917
+ {
1918
+ "epoch": 0.7212643678160919,
1919
+ "grad_norm": 2.952456474304199,
1920
+ "learning_rate": 4.376676044775601e-06,
1921
+ "loss": 0.3221,
1922
+ "step": 251
1923
+ },
1924
+ {
1925
+ "epoch": 0.7241379310344828,
1926
+ "grad_norm": 2.3187882900238037,
1927
+ "learning_rate": 4.293963294950313e-06,
1928
+ "loss": 0.296,
1929
+ "step": 252
1930
+ },
1931
+ {
1932
+ "epoch": 0.7270114942528736,
1933
+ "grad_norm": 4.975540637969971,
1934
+ "learning_rate": 4.211825378513066e-06,
1935
+ "loss": 0.3873,
1936
+ "step": 253
1937
+ },
1938
+ {
1939
+ "epoch": 0.7298850574712644,
1940
+ "grad_norm": 2.431337833404541,
1941
+ "learning_rate": 4.130270570141931e-06,
1942
+ "loss": 0.3386,
1943
+ "step": 254
1944
+ },
1945
+ {
1946
+ "epoch": 0.7327586206896551,
1947
+ "grad_norm": 3.258333921432495,
1948
+ "learning_rate": 4.0493070857719305e-06,
1949
+ "loss": 0.3402,
1950
+ "step": 255
1951
+ },
1952
+ {
1953
+ "epoch": 0.735632183908046,
1954
+ "grad_norm": 2.442279577255249,
1955
+ "learning_rate": 3.968943081767358e-06,
1956
+ "loss": 0.2268,
1957
+ "step": 256
1958
+ },
1959
+ {
1960
+ "epoch": 0.7385057471264368,
1961
+ "grad_norm": 3.3889667987823486,
1962
+ "learning_rate": 3.889186654100089e-06,
1963
+ "loss": 0.3008,
1964
+ "step": 257
1965
+ },
1966
+ {
1967
+ "epoch": 0.7413793103448276,
1968
+ "grad_norm": 2.3388214111328125,
1969
+ "learning_rate": 3.81004583753399e-06,
1970
+ "loss": 0.3399,
1971
+ "step": 258
1972
+ },
1973
+ {
1974
+ "epoch": 0.7442528735632183,
1975
+ "grad_norm": 2.363194704055786,
1976
+ "learning_rate": 3.7315286048154862e-06,
1977
+ "loss": 0.2797,
1978
+ "step": 259
1979
+ },
1980
+ {
1981
+ "epoch": 0.7471264367816092,
1982
+ "grad_norm": 3.6801648139953613,
1983
+ "learning_rate": 3.6536428658703594e-06,
1984
+ "loss": 0.3179,
1985
+ "step": 260
1986
+ },
1987
+ {
1988
+ "epoch": 0.7471264367816092,
1989
+ "eval_accuracy": 0.8663366336633663,
1990
+ "eval_f1": 0.7452830188679245,
1991
+ "eval_loss": 0.3128357529640198,
1992
+ "eval_precision": 0.7452830188679245,
1993
+ "eval_recall": 0.7452830188679245,
1994
+ "eval_runtime": 16.9937,
1995
+ "eval_samples_per_second": 6.238,
1996
+ "eval_steps_per_second": 0.235,
1997
+ "step": 260
1998
+ },
1999
+ {
2000
+ "epoch": 0.75,
2001
+ "grad_norm": 3.5845208168029785,
2002
+ "learning_rate": 3.576396467006925e-06,
2003
+ "loss": 0.3121,
2004
+ "step": 261
2005
+ },
2006
+ {
2007
+ "epoch": 0.7528735632183908,
2008
+ "grad_norm": 2.5115549564361572,
2009
+ "learning_rate": 3.4997971901255588e-06,
2010
+ "loss": 0.2695,
2011
+ "step": 262
2012
+ },
2013
+ {
2014
+ "epoch": 0.7557471264367817,
2015
+ "grad_norm": 3.1949312686920166,
2016
+ "learning_rate": 3.4238527519347353e-06,
2017
+ "loss": 0.29,
2018
+ "step": 263
2019
+ },
2020
+ {
2021
+ "epoch": 0.7586206896551724,
2022
+ "grad_norm": 3.134657859802246,
2023
+ "learning_rate": 3.3485708031736698e-06,
2024
+ "loss": 0.2959,
2025
+ "step": 264
2026
+ },
2027
+ {
2028
+ "epoch": 0.7614942528735632,
2029
+ "grad_norm": 2.359828472137451,
2030
+ "learning_rate": 3.2739589278415252e-06,
2031
+ "loss": 0.299,
2032
+ "step": 265
2033
+ },
2034
+ {
2035
+ "epoch": 0.764367816091954,
2036
+ "grad_norm": 2.662598133087158,
2037
+ "learning_rate": 3.2000246424334315e-06,
2038
+ "loss": 0.2887,
2039
+ "step": 266
2040
+ },
2041
+ {
2042
+ "epoch": 0.7672413793103449,
2043
+ "grad_norm": 2.822681188583374,
2044
+ "learning_rate": 3.1267753951832523e-06,
2045
+ "loss": 0.3337,
2046
+ "step": 267
2047
+ },
2048
+ {
2049
+ "epoch": 0.7701149425287356,
2050
+ "grad_norm": 3.435675859451294,
2051
+ "learning_rate": 3.0542185653132216e-06,
2052
+ "loss": 0.2431,
2053
+ "step": 268
2054
+ },
2055
+ {
2056
+ "epoch": 0.7729885057471264,
2057
+ "grad_norm": 3.8508052825927734,
2058
+ "learning_rate": 2.982361462290575e-06,
2059
+ "loss": 0.3625,
2060
+ "step": 269
2061
+ },
2062
+ {
2063
+ "epoch": 0.7758620689655172,
2064
+ "grad_norm": 3.24882435798645,
2065
+ "learning_rate": 2.9112113250911844e-06,
2066
+ "loss": 0.3255,
2067
+ "step": 270
2068
+ },
2069
+ {
2070
+ "epoch": 0.7787356321839081,
2071
+ "grad_norm": 3.215721368789673,
2072
+ "learning_rate": 2.8407753214702694e-06,
2073
+ "loss": 0.3055,
2074
+ "step": 271
2075
+ },
2076
+ {
2077
+ "epoch": 0.7816091954022989,
2078
+ "grad_norm": 3.5768065452575684,
2079
+ "learning_rate": 2.7710605472403373e-06,
2080
+ "loss": 0.2593,
2081
+ "step": 272
2082
+ },
2083
+ {
2084
+ "epoch": 0.7844827586206896,
2085
+ "grad_norm": 3.4842770099639893,
2086
+ "learning_rate": 2.702074025556327e-06,
2087
+ "loss": 0.3183,
2088
+ "step": 273
2089
+ },
2090
+ {
2091
+ "epoch": 0.7873563218390804,
2092
+ "grad_norm": 2.8685038089752197,
2093
+ "learning_rate": 2.6338227062080924e-06,
2094
+ "loss": 0.2674,
2095
+ "step": 274
2096
+ },
2097
+ {
2098
+ "epoch": 0.7902298850574713,
2099
+ "grad_norm": 3.008521318435669,
2100
+ "learning_rate": 2.566313464920265e-06,
2101
+ "loss": 0.2944,
2102
+ "step": 275
2103
+ },
2104
+ {
2105
+ "epoch": 0.7931034482758621,
2106
+ "grad_norm": 2.9339377880096436,
2107
+ "learning_rate": 2.4995531026595952e-06,
2108
+ "loss": 0.295,
2109
+ "step": 276
2110
+ },
2111
+ {
2112
+ "epoch": 0.7959770114942529,
2113
+ "grad_norm": 4.123067378997803,
2114
+ "learning_rate": 2.4335483449498053e-06,
2115
+ "loss": 0.2295,
2116
+ "step": 277
2117
+ },
2118
+ {
2119
+ "epoch": 0.7988505747126436,
2120
+ "grad_norm": 2.862365245819092,
2121
+ "learning_rate": 2.3683058411940563e-06,
2122
+ "loss": 0.2967,
2123
+ "step": 278
2124
+ },
2125
+ {
2126
+ "epoch": 0.8017241379310345,
2127
+ "grad_norm": 4.078983783721924,
2128
+ "learning_rate": 2.3038321640050763e-06,
2129
+ "loss": 0.3005,
2130
+ "step": 279
2131
+ },
2132
+ {
2133
+ "epoch": 0.8045977011494253,
2134
+ "grad_norm": 4.147453308105469,
2135
+ "learning_rate": 2.2401338085430326e-06,
2136
+ "loss": 0.2901,
2137
+ "step": 280
2138
+ },
2139
+ {
2140
+ "epoch": 0.8045977011494253,
2141
+ "eval_accuracy": 0.8638613861386139,
2142
+ "eval_f1": 0.7417840375586855,
2143
+ "eval_loss": 0.3146108090877533,
2144
+ "eval_precision": 0.7383177570093458,
2145
+ "eval_recall": 0.7452830188679245,
2146
+ "eval_runtime": 16.7331,
2147
+ "eval_samples_per_second": 6.335,
2148
+ "eval_steps_per_second": 0.239,
2149
+ "step": 280
2150
+ },
2151
+ {
2152
+ "epoch": 0.8074712643678161,
2153
+ "grad_norm": 2.4641005992889404,
2154
+ "learning_rate": 2.177217191861183e-06,
2155
+ "loss": 0.2452,
2156
+ "step": 281
2157
+ },
2158
+ {
2159
+ "epoch": 0.8103448275862069,
2160
+ "grad_norm": 3.1481075286865234,
2161
+ "learning_rate": 2.115088652259446e-06,
2162
+ "loss": 0.3355,
2163
+ "step": 282
2164
+ },
2165
+ {
2166
+ "epoch": 0.8132183908045977,
2167
+ "grad_norm": 2.2011497020721436,
2168
+ "learning_rate": 2.053754448645846e-06,
2169
+ "loss": 0.2256,
2170
+ "step": 283
2171
+ },
2172
+ {
2173
+ "epoch": 0.8160919540229885,
2174
+ "grad_norm": 3.1297502517700195,
2175
+ "learning_rate": 1.9932207599059782e-06,
2176
+ "loss": 0.2899,
2177
+ "step": 284
2178
+ },
2179
+ {
2180
+ "epoch": 0.8189655172413793,
2181
+ "grad_norm": 2.566171646118164,
2182
+ "learning_rate": 1.933493684280574e-06,
2183
+ "loss": 0.2527,
2184
+ "step": 285
2185
+ },
2186
+ {
2187
+ "epoch": 0.8218390804597702,
2188
+ "grad_norm": 3.0499560832977295,
2189
+ "learning_rate": 1.8745792387511241e-06,
2190
+ "loss": 0.2979,
2191
+ "step": 286
2192
+ },
2193
+ {
2194
+ "epoch": 0.8247126436781609,
2195
+ "grad_norm": 3.5081562995910645,
2196
+ "learning_rate": 1.8164833584337216e-06,
2197
+ "loss": 0.2766,
2198
+ "step": 287
2199
+ },
2200
+ {
2201
+ "epoch": 0.8275862068965517,
2202
+ "grad_norm": 3.2664620876312256,
2203
+ "learning_rate": 1.75921189598118e-06,
2204
+ "loss": 0.3008,
2205
+ "step": 288
2206
+ },
2207
+ {
2208
+ "epoch": 0.8304597701149425,
2209
+ "grad_norm": 3.314521551132202,
2210
+ "learning_rate": 1.7027706209933903e-06,
2211
+ "loss": 0.3387,
2212
+ "step": 289
2213
+ },
2214
+ {
2215
+ "epoch": 0.8333333333333334,
2216
+ "grad_norm": 2.5149619579315186,
2217
+ "learning_rate": 1.6471652194361131e-06,
2218
+ "loss": 0.3032,
2219
+ "step": 290
2220
+ },
2221
+ {
2222
+ "epoch": 0.8362068965517241,
2223
+ "grad_norm": 3.847849130630493,
2224
+ "learning_rate": 1.5924012930681643e-06,
2225
+ "loss": 0.3218,
2226
+ "step": 291
2227
+ },
2228
+ {
2229
+ "epoch": 0.8390804597701149,
2230
+ "grad_norm": 6.620360374450684,
2231
+ "learning_rate": 1.5384843588770626e-06,
2232
+ "loss": 0.3464,
2233
+ "step": 292
2234
+ },
2235
+ {
2236
+ "epoch": 0.8419540229885057,
2237
+ "grad_norm": 3.233356475830078,
2238
+ "learning_rate": 1.4854198485232696e-06,
2239
+ "loss": 0.2475,
2240
+ "step": 293
2241
+ },
2242
+ {
2243
+ "epoch": 0.8448275862068966,
2244
+ "grad_norm": 4.268490314483643,
2245
+ "learning_rate": 1.433213107792991e-06,
2246
+ "loss": 0.3646,
2247
+ "step": 294
2248
+ },
2249
+ {
2250
+ "epoch": 0.8477011494252874,
2251
+ "grad_norm": 3.641005754470825,
2252
+ "learning_rate": 1.3818693960596186e-06,
2253
+ "loss": 0.3347,
2254
+ "step": 295
2255
+ },
2256
+ {
2257
+ "epoch": 0.8505747126436781,
2258
+ "grad_norm": 2.945902109146118,
2259
+ "learning_rate": 1.3313938857539133e-06,
2260
+ "loss": 0.2806,
2261
+ "step": 296
2262
+ },
2263
+ {
2264
+ "epoch": 0.853448275862069,
2265
+ "grad_norm": 3.552212715148926,
2266
+ "learning_rate": 1.2817916618429194e-06,
2267
+ "loss": 0.2993,
2268
+ "step": 297
2269
+ },
2270
+ {
2271
+ "epoch": 0.8563218390804598,
2272
+ "grad_norm": 3.9987523555755615,
2273
+ "learning_rate": 1.2330677213177034e-06,
2274
+ "loss": 0.2611,
2275
+ "step": 298
2276
+ },
2277
+ {
2278
+ "epoch": 0.8591954022988506,
2279
+ "grad_norm": 4.93873405456543,
2280
+ "learning_rate": 1.1852269726899423e-06,
2281
+ "loss": 0.3131,
2282
+ "step": 299
2283
+ },
2284
+ {
2285
+ "epoch": 0.8620689655172413,
2286
+ "grad_norm": 2.6833460330963135,
2287
+ "learning_rate": 1.138274235497443e-06,
2288
+ "loss": 0.2587,
2289
+ "step": 300
2290
+ },
2291
+ {
2292
+ "epoch": 0.8620689655172413,
2293
+ "eval_accuracy": 0.8638613861386139,
2294
+ "eval_f1": 0.7417840375586855,
2295
+ "eval_loss": 0.3137795925140381,
2296
+ "eval_precision": 0.7383177570093458,
2297
+ "eval_recall": 0.7452830188679245,
2298
+ "eval_runtime": 16.8912,
2299
+ "eval_samples_per_second": 6.275,
2300
+ "eval_steps_per_second": 0.237,
2301
+ "step": 300
2302
  }
2303
  ],
2304
  "logging_steps": 1,
 
2318
  "attributes": {}
2319
  }
2320
  },
2321
+ "total_flos": 9.46781682579538e+16,
2322
  "train_batch_size": 8,
2323
  "trial_name": null,
2324
  "trial_params": null