romainnn committed
Commit 62a08b1 · verified · 1 Parent(s): 40d1b34

Training in progress, step 291, checkpoint
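This commit refreshes the files the Hugging Face Trainer writes when it saves a checkpoint: the adapter weights (adapter_model.safetensors), optimizer, scheduler and RNG state, plus trainer_state.json. A minimal sketch of picking the run back up from this directory, assuming a Trainer built the same way as the original run (build_trainer below is a hypothetical stand-in for that setup, not part of this repo):

# Hypothetical sketch: resume training from the saved checkpoint directory.
# build_trainer() stands in for whatever constructed the original Trainer
# (model, TrainingArguments, datasets); it is an assumption, not this repo's code.
trainer = build_trainer()
trainer.train(resume_from_checkpoint="last-checkpoint")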

last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
- oid sha256:0a345c181506a3736c89ebe1e76c24ed9808d75b880ba78f1e490c9285303307
+ oid sha256:d559c507527408151305de60bfac7ecab7fdc8cec07a0a100cf95fa50b736969
 size 200068512
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
- oid sha256:b00781979e5718df8a0ecd8887aabfc9b2f529fb29fc12545551cf26a0e7e07e
- size 102030196
+ oid sha256:13ca1ecb806bdfef2a06cb71be0add3a97cea5049a108463e73c7b0210802d2a
+ size 102030580
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
- oid sha256:c52129480a4e1f654388f2b5c630b1df2103a8d90771ca1686c3cc2eb8f690b7
+ oid sha256:56b62f440272694c6b2ce6984e703f343f25272dbe6369c607d93c46957b1d63
 size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
- oid sha256:1eb20d9a4842f45ebf67618ccfa4c674d5289be40e6a84d29fe1d5d499c01762
+ oid sha256:fb8ca612c507917e163ce09dc4e7e572d1fec912d9e349f29fc7ce0c23b3101f
 size 1064
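Each CHANGED file above is a Git LFS pointer, not the payload itself: version names the pointer spec, oid sha256:<hex> is the SHA-256 digest of the actual file content, and size is its length in bytes. Identical sizes with different oids (as for adapter_model.safetensors) therefore mean same-sized but different content. A small self-contained sketch of checking a downloaded blob against its pointer (file paths are examples):

import hashlib
import os

def verify_lfs_pointer(pointer_path: str, blob_path: str) -> bool:
    # Parse the "key value" lines of the pointer file.
    fields = {}
    with open(pointer_path) as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    expected_oid = fields["oid"].removeprefix("sha256:")
    expected_size = int(fields["size"])
    # Hash the blob in chunks so large checkpoint files fit in memory.
    sha = hashlib.sha256()
    with open(blob_path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            sha.update(chunk)
    return (sha.hexdigest() == expected_oid
            and os.path.getsize(blob_path) == expected_size)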
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
 {
 "best_metric": 0.003245619358494878,
 "best_model_checkpoint": "miner_id_24/checkpoint-200",
- "epoch": 1.3745704467353952,
+ "epoch": 2.0,
 "eval_steps": 100,
- "global_step": 200,
+ "global_step": 291,
 "is_hyper_param_search": false,
 "is_local_process_zero": true,
 "is_world_process_zero": true,
@@ -1431,6 +1431,643 @@
 "eval_samples_per_second": 10.861,
 "eval_steps_per_second": 2.749,
 "step": 200
+ },
+ {
+ "epoch": 1.3814432989690721,
+ "grad_norm": 0.1602359563112259,
+ "learning_rate": 4.649285891779327e-05,
+ "loss": 0.0016,
+ "step": 201
+ },
+ {
+ "epoch": 1.388316151202749,
+ "grad_norm": 0.07345937192440033,
+ "learning_rate": 4.555172473510329e-05,
+ "loss": 0.0019,
+ "step": 202
+ },
+ {
+ "epoch": 1.395189003436426,
+ "grad_norm": 0.1161644235253334,
+ "learning_rate": 4.461739615694929e-05,
+ "loss": 0.0015,
+ "step": 203
+ },
+ {
+ "epoch": 1.402061855670103,
+ "grad_norm": 0.06747262179851532,
+ "learning_rate": 4.3689989967026935e-05,
+ "loss": 0.0039,
+ "step": 204
+ },
+ {
+ "epoch": 1.40893470790378,
+ "grad_norm": 0.14562109112739563,
+ "learning_rate": 4.276962208378811e-05,
+ "loss": 0.0029,
+ "step": 205
+ },
+ {
+ "epoch": 1.4158075601374571,
+ "grad_norm": 0.0071322242729365826,
+ "learning_rate": 4.1856407545951834e-05,
+ "loss": 0.0001,
+ "step": 206
+ },
+ {
+ "epoch": 1.422680412371134,
+ "grad_norm": 0.07651171833276749,
+ "learning_rate": 4.095046049812545e-05,
+ "loss": 0.0045,
+ "step": 207
+ },
+ {
+ "epoch": 1.429553264604811,
+ "grad_norm": 0.15785658359527588,
+ "learning_rate": 4.005189417653743e-05,
+ "loss": 0.001,
+ "step": 208
+ },
+ {
+ "epoch": 1.436426116838488,
+ "grad_norm": 0.09830784052610397,
+ "learning_rate": 3.916082089488372e-05,
+ "loss": 0.0036,
+ "step": 209
+ },
+ {
+ "epoch": 1.443298969072165,
+ "grad_norm": 0.18310892581939697,
+ "learning_rate": 3.827735203028953e-05,
+ "loss": 0.0053,
+ "step": 210
+ },
+ {
+ "epoch": 1.4501718213058419,
+ "grad_norm": 0.15072131156921387,
+ "learning_rate": 3.740159800938784e-05,
+ "loss": 0.0012,
+ "step": 211
+ },
+ {
+ "epoch": 1.4570446735395188,
+ "grad_norm": 0.14942792057991028,
+ "learning_rate": 3.6533668294517155e-05,
+ "loss": 0.0029,
+ "step": 212
+ },
+ {
+ "epoch": 1.463917525773196,
+ "grad_norm": 0.04605146124958992,
+ "learning_rate": 3.5673671370039466e-05,
+ "loss": 0.0019,
+ "step": 213
+ },
+ {
+ "epoch": 1.470790378006873,
+ "grad_norm": 0.09986945241689682,
+ "learning_rate": 3.482171472878062e-05,
+ "loss": 0.0023,
+ "step": 214
+ },
+ {
+ "epoch": 1.47766323024055,
+ "grad_norm": 0.06329240649938583,
+ "learning_rate": 3.39779048585945e-05,
+ "loss": 0.0009,
+ "step": 215
+ },
+ {
+ "epoch": 1.4845360824742269,
+ "grad_norm": 0.04497532546520233,
+ "learning_rate": 3.3142347229053015e-05,
+ "loss": 0.0016,
+ "step": 216
+ },
+ {
+ "epoch": 1.4914089347079038,
+ "grad_norm": 0.05954873189330101,
+ "learning_rate": 3.231514627826305e-05,
+ "loss": 0.0016,
+ "step": 217
+ },
+ {
+ "epoch": 1.4982817869415808,
+ "grad_norm": 0.11368856579065323,
+ "learning_rate": 3.149640539981267e-05,
+ "loss": 0.0042,
+ "step": 218
+ },
+ {
+ "epoch": 1.5051546391752577,
+ "grad_norm": 0.0854845941066742,
+ "learning_rate": 3.068622692984762e-05,
+ "loss": 0.0048,
+ "step": 219
+ },
+ {
+ "epoch": 1.5120274914089347,
+ "grad_norm": 0.11908043920993805,
+ "learning_rate": 2.9884712134280324e-05,
+ "loss": 0.004,
+ "step": 220
+ },
+ {
+ "epoch": 1.5189003436426116,
+ "grad_norm": 0.13990691304206848,
+ "learning_rate": 2.909196119613218e-05,
+ "loss": 0.0047,
+ "step": 221
+ },
+ {
+ "epoch": 1.5257731958762886,
+ "grad_norm": 0.08153674751520157,
+ "learning_rate": 2.8308073203011663e-05,
+ "loss": 0.0051,
+ "step": 222
+ },
+ {
+ "epoch": 1.5326460481099655,
+ "grad_norm": 0.06916696578264236,
+ "learning_rate": 2.753314613472906e-05,
+ "loss": 0.0022,
+ "step": 223
+ },
+ {
+ "epoch": 1.5395189003436425,
+ "grad_norm": 0.0983443409204483,
+ "learning_rate": 2.6767276851049816e-05,
+ "loss": 0.0037,
+ "step": 224
+ },
+ {
+ "epoch": 1.5463917525773194,
+ "grad_norm": 0.18421703577041626,
+ "learning_rate": 2.6010561079587813e-05,
+ "loss": 0.0017,
+ "step": 225
+ },
+ {
+ "epoch": 1.5532646048109966,
+ "grad_norm": 0.15690971910953522,
+ "learning_rate": 2.5263093403840142e-05,
+ "loss": 0.0024,
+ "step": 226
+ },
+ {
+ "epoch": 1.5601374570446735,
+ "grad_norm": 0.06236126646399498,
+ "learning_rate": 2.4524967251365026e-05,
+ "loss": 0.0022,
+ "step": 227
+ },
+ {
+ "epoch": 1.5670103092783505,
+ "grad_norm": 0.1433926373720169,
+ "learning_rate": 2.3796274882103964e-05,
+ "loss": 0.0052,
+ "step": 228
+ },
+ {
+ "epoch": 1.5738831615120275,
+ "grad_norm": 0.15132726728916168,
+ "learning_rate": 2.3077107376850005e-05,
+ "loss": 0.0059,
+ "step": 229
+ },
+ {
+ "epoch": 1.5807560137457046,
+ "grad_norm": 0.06804714351892471,
+ "learning_rate": 2.2367554625863497e-05,
+ "loss": 0.0026,
+ "step": 230
+ },
+ {
+ "epoch": 1.5876288659793816,
+ "grad_norm": 0.12087640911340714,
+ "learning_rate": 2.166770531763633e-05,
+ "loss": 0.0035,
+ "step": 231
+ },
+ {
+ "epoch": 1.5945017182130585,
+ "grad_norm": 0.22184179723262787,
+ "learning_rate": 2.0977646927806683e-05,
+ "loss": 0.0034,
+ "step": 232
+ },
+ {
+ "epoch": 1.6013745704467355,
+ "grad_norm": 0.08892931044101715,
+ "learning_rate": 2.0297465708225238e-05,
+ "loss": 0.0025,
+ "step": 233
+ },
+ {
+ "epoch": 1.6082474226804124,
+ "grad_norm": 0.09320689737796783,
+ "learning_rate": 1.962724667617436e-05,
+ "loss": 0.0043,
+ "step": 234
+ },
+ {
+ "epoch": 1.6151202749140894,
+ "grad_norm": 0.18926142156124115,
+ "learning_rate": 1.896707360374167e-05,
+ "loss": 0.0077,
+ "step": 235
+ },
+ {
+ "epoch": 1.6219931271477663,
+ "grad_norm": 0.07618946582078934,
+ "learning_rate": 1.8317029007349085e-05,
+ "loss": 0.0024,
+ "step": 236
+ },
+ {
+ "epoch": 1.6288659793814433,
+ "grad_norm": 0.023058777675032616,
+ "learning_rate": 1.7677194137439035e-05,
+ "loss": 0.0005,
+ "step": 237
+ },
+ {
+ "epoch": 1.6357388316151202,
+ "grad_norm": 0.08784357458353043,
+ "learning_rate": 1.7047648968318698e-05,
+ "loss": 0.0064,
+ "step": 238
+ },
+ {
+ "epoch": 1.6426116838487972,
+ "grad_norm": 0.032160189002752304,
+ "learning_rate": 1.642847218816398e-05,
+ "loss": 0.001,
+ "step": 239
+ },
+ {
+ "epoch": 1.6494845360824741,
+ "grad_norm": 0.043870214372873306,
+ "learning_rate": 1.58197411891839e-05,
+ "loss": 0.0013,
+ "step": 240
+ },
+ {
+ "epoch": 1.656357388316151,
+ "grad_norm": 0.08430828154087067,
+ "learning_rate": 1.5221532057947419e-05,
+ "loss": 0.0008,
+ "step": 241
+ },
+ {
+ "epoch": 1.663230240549828,
+ "grad_norm": 0.0656968504190445,
+ "learning_rate": 1.4633919565873033e-05,
+ "loss": 0.0012,
+ "step": 242
+ },
+ {
+ "epoch": 1.670103092783505,
+ "grad_norm": 0.26261475682258606,
+ "learning_rate": 1.4056977159883012e-05,
+ "loss": 0.0033,
+ "step": 243
+ },
+ {
+ "epoch": 1.6769759450171822,
+ "grad_norm": 0.08948194980621338,
+ "learning_rate": 1.3490776953223105e-05,
+ "loss": 0.0028,
+ "step": 244
+ },
+ {
+ "epoch": 1.6838487972508591,
+ "grad_norm": 0.07044168561697006,
+ "learning_rate": 1.2935389716448976e-05,
+ "loss": 0.0022,
+ "step": 245
+ },
+ {
+ "epoch": 1.690721649484536,
+ "grad_norm": 0.08131295442581177,
+ "learning_rate": 1.23908848685804e-05,
+ "loss": 0.002,
+ "step": 246
+ },
+ {
+ "epoch": 1.697594501718213,
+ "grad_norm": 0.1506740152835846,
+ "learning_rate": 1.1857330468424466e-05,
+ "loss": 0.0035,
+ "step": 247
+ },
+ {
+ "epoch": 1.7044673539518902,
+ "grad_norm": 0.045038871467113495,
+ "learning_rate": 1.133479320606874e-05,
+ "loss": 0.0019,
+ "step": 248
+ },
+ {
+ "epoch": 1.7113402061855671,
+ "grad_norm": 0.14267544448375702,
+ "learning_rate": 1.082333839454559e-05,
+ "loss": 0.0047,
+ "step": 249
+ },
+ {
+ "epoch": 1.718213058419244,
+ "grad_norm": 0.11678794771432877,
+ "learning_rate": 1.0323029961668462e-05,
+ "loss": 0.002,
+ "step": 250
+ },
+ {
+ "epoch": 1.725085910652921,
+ "grad_norm": 0.04625638574361801,
+ "learning_rate": 9.833930442041506e-06,
+ "loss": 0.0022,
+ "step": 251
+ },
+ {
+ "epoch": 1.731958762886598,
+ "grad_norm": 0.13576196134090424,
+ "learning_rate": 9.35610096924323e-06,
+ "loss": 0.0022,
+ "step": 252
+ },
+ {
+ "epoch": 1.738831615120275,
+ "grad_norm": 0.10366480052471161,
+ "learning_rate": 8.889601268185232e-06,
+ "loss": 0.0055,
+ "step": 253
+ },
+ {
+ "epoch": 1.745704467353952,
+ "grad_norm": 0.04077526181936264,
+ "learning_rate": 8.434489647647092e-06,
+ "loss": 0.0017,
+ "step": 254
+ },
+ {
+ "epoch": 1.7525773195876289,
+ "grad_norm": 0.11092139035463333,
+ "learning_rate": 7.990822992988267e-06,
+ "loss": 0.0034,
+ "step": 255
+ },
+ {
+ "epoch": 1.7594501718213058,
+ "grad_norm": 0.069175124168396,
+ "learning_rate": 7.558656759037797e-06,
+ "loss": 0.0026,
+ "step": 256
+ },
+ {
+ "epoch": 1.7663230240549828,
+ "grad_norm": 0.12054812163114548,
+ "learning_rate": 7.13804496316296e-06,
+ "loss": 0.0024,
+ "step": 257
+ },
+ {
+ "epoch": 1.7731958762886597,
+ "grad_norm": 0.03802892938256264,
+ "learning_rate": 6.729040178517454e-06,
+ "loss": 0.0011,
+ "step": 258
+ },
+ {
+ "epoch": 1.7800687285223367,
+ "grad_norm": 0.031027084216475487,
+ "learning_rate": 6.331693527470306e-06,
+ "loss": 0.0013,
+ "step": 259
+ },
+ {
+ "epoch": 1.7869415807560136,
+ "grad_norm": 0.20980218052864075,
+ "learning_rate": 5.946054675215784e-06,
+ "loss": 0.0011,
+ "step": 260
+ },
+ {
+ "epoch": 1.7938144329896906,
+ "grad_norm": 0.1241433396935463,
+ "learning_rate": 5.572171823565797e-06,
+ "loss": 0.0011,
+ "step": 261
+ },
+ {
+ "epoch": 1.8006872852233677,
+ "grad_norm": 0.10405414551496506,
+ "learning_rate": 5.210091704924946e-06,
+ "loss": 0.0063,
+ "step": 262
+ },
+ {
+ "epoch": 1.8075601374570447,
+ "grad_norm": 0.019170010462403297,
+ "learning_rate": 4.859859576449444e-06,
+ "loss": 0.0002,
+ "step": 263
+ },
+ {
+ "epoch": 1.8144329896907216,
+ "grad_norm": 0.10227034986019135,
+ "learning_rate": 4.521519214390257e-06,
+ "loss": 0.0024,
+ "step": 264
+ },
+ {
+ "epoch": 1.8213058419243986,
+ "grad_norm": 0.06309763342142105,
+ "learning_rate": 4.195112908621402e-06,
+ "loss": 0.002,
+ "step": 265
+ },
+ {
+ "epoch": 1.8281786941580758,
+ "grad_norm": 0.11718452721834183,
+ "learning_rate": 3.880681457354118e-06,
+ "loss": 0.0011,
+ "step": 266
+ },
+ {
+ "epoch": 1.8350515463917527,
+ "grad_norm": 0.04819872975349426,
+ "learning_rate": 3.578264162037348e-06,
+ "loss": 0.003,
+ "step": 267
+ },
+ {
+ "epoch": 1.8419243986254297,
+ "grad_norm": 0.0733291283249855,
+ "learning_rate": 3.2878988224454344e-06,
+ "loss": 0.0016,
+ "step": 268
+ },
+ {
+ "epoch": 1.8487972508591066,
+ "grad_norm": 0.1850636601448059,
+ "learning_rate": 3.0096217319533382e-06,
+ "loss": 0.0032,
+ "step": 269
+ },
+ {
+ "epoch": 1.8556701030927836,
+ "grad_norm": 0.40491941571235657,
+ "learning_rate": 2.7434676730003884e-06,
+ "loss": 0.004,
+ "step": 270
+ },
+ {
+ "epoch": 1.8625429553264605,
+ "grad_norm": 0.08381707221269608,
+ "learning_rate": 2.4894699127426367e-06,
+ "loss": 0.0018,
+ "step": 271
+ },
+ {
+ "epoch": 1.8694158075601375,
+ "grad_norm": 0.06890317052602768,
+ "learning_rate": 2.2476601988947966e-06,
+ "loss": 0.0026,
+ "step": 272
+ },
+ {
+ "epoch": 1.8762886597938144,
+ "grad_norm": 0.15883168578147888,
+ "learning_rate": 2.0180687557619816e-06,
+ "loss": 0.0034,
+ "step": 273
+ },
+ {
+ "epoch": 1.8831615120274914,
+ "grad_norm": 0.06412586569786072,
+ "learning_rate": 1.8007242804619628e-06,
+ "loss": 0.0014,
+ "step": 274
+ },
+ {
+ "epoch": 1.8900343642611683,
+ "grad_norm": 0.07986130565404892,
+ "learning_rate": 1.595653939338204e-06,
+ "loss": 0.0015,
+ "step": 275
+ },
+ {
+ "epoch": 1.8969072164948453,
+ "grad_norm": 0.07344045490026474,
+ "learning_rate": 1.4028833645643113e-06,
+ "loss": 0.0011,
+ "step": 276
+ },
+ {
+ "epoch": 1.9037800687285222,
+ "grad_norm": 0.1562926471233368,
+ "learning_rate": 1.222436650940173e-06,
+ "loss": 0.0092,
+ "step": 277
+ },
+ {
+ "epoch": 1.9106529209621992,
+ "grad_norm": 0.11137005686759949,
+ "learning_rate": 1.0543363528803696e-06,
+ "loss": 0.0027,
+ "step": 278
+ },
+ {
+ "epoch": 1.9175257731958761,
+ "grad_norm": 0.1020520031452179,
+ "learning_rate": 8.986034815950172e-07,
+ "loss": 0.0054,
+ "step": 279
+ },
+ {
+ "epoch": 1.9243986254295533,
+ "grad_norm": 0.044514842331409454,
+ "learning_rate": 7.552575024634689e-07,
+ "loss": 0.0016,
+ "step": 280
+ },
+ {
+ "epoch": 1.9312714776632303,
+ "grad_norm": 0.05561070516705513,
+ "learning_rate": 6.243163326014267e-07,
+ "loss": 0.0022,
+ "step": 281
+ },
+ {
+ "epoch": 1.9381443298969072,
+ "grad_norm": 0.22467757761478424,
+ "learning_rate": 5.057963386213116e-07,
+ "loss": 0.0035,
+ "step": 282
+ },
+ {
+ "epoch": 1.9450171821305842,
+ "grad_norm": 0.08855283260345459,
+ "learning_rate": 3.9971233458665493e-07,
+ "loss": 0.0023,
+ "step": 283
+ },
+ {
+ "epoch": 1.9518900343642611,
+ "grad_norm": 0.14920522272586823,
+ "learning_rate": 3.060775801604354e-07,
+ "loss": 0.0061,
+ "step": 284
+ },
+ {
+ "epoch": 1.9587628865979383,
+ "grad_norm": 0.11901390552520752,
+ "learning_rate": 2.2490377894768267e-07,
+ "loss": 0.0055,
+ "step": 285
+ },
+ {
+ "epoch": 1.9656357388316152,
+ "grad_norm": 0.07598073780536652,
+ "learning_rate": 1.562010770326916e-07,
+ "loss": 0.0036,
+ "step": 286
+ },
+ {
+ "epoch": 1.9725085910652922,
+ "grad_norm": 0.1735997498035431,
+ "learning_rate": 9.99780617107815e-08,
+ "loss": 0.0038,
+ "step": 287
+ },
+ {
+ "epoch": 1.9793814432989691,
+ "grad_norm": 0.1865747720003128,
+ "learning_rate": 5.6241760414987856e-08,
+ "loss": 0.007,
+ "step": 288
+ },
+ {
+ "epoch": 1.986254295532646,
+ "grad_norm": 0.05057632550597191,
+ "learning_rate": 2.4997639837687213e-08,
+ "loss": 0.0032,
+ "step": 289
+ },
+ {
+ "epoch": 1.993127147766323,
+ "grad_norm": 0.13223591446876526,
+ "learning_rate": 6.2496052472549304e-09,
+ "loss": 0.0037,
+ "step": 290
+ },
+ {
+ "epoch": 2.0,
+ "grad_norm": 0.31574028730392456,
+ "learning_rate": 0.0,
+ "loss": 0.0051,
+ "step": 291
 }
 ],
 "logging_steps": 1,
@@ -1454,12 +2091,12 @@
 "should_evaluate": false,
 "should_log": false,
 "should_save": true,
- "should_training_stop": false
+ "should_training_stop": true
 },
 "attributes": {}
 }
 },
- "total_flos": 3.06709813719466e+17,
+ "total_flos": 4.4623665662263296e+17,
 "train_batch_size": 4,
 "trial_name": null,
 "trial_params": null