diaenra commited on
Commit
22478bd
·
verified ·
1 Parent(s): f1d840d

Training in progress, step 436, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:879fa67154bf0aca42b4b9242a8ff58cd002cdfcc474eed0e4463de438e26495
3
  size 614968888
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d3eabb358380da88f3e2eb2384a4f4f8f13aedb6194ee219a6d4e44932060672
3
  size 614968888
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4ed0e47ca4dca034e5f010a9293d2d53ccfd138ceba532640b6414ca67942b45
3
  size 1230132424
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0d1d80a8bc8236676e6232fb2db29ec0a42aa6330168f2aaf72934c21e75f0d2
3
  size 1230132424
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:52edada40d769d2b71224fc0f5955b314d74221f3f14de36e52f8dd0922c11b7
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d3d44e1ca4f3d5f0b30777072074e21fca5cb6c135bd86fac71f54a60249896c
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:50f424e719b7520a943d5504702f571438803b0c98ea72e695e69445f08a4aeb
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4afd94ddd3630727d7e4646bbec3048ec7f0e8b826d6c96ce73546e36a1353bf
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.5481651376146789,
5
  "eval_steps": 500,
6
- "global_step": 239,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -1680,6 +1680,1385 @@
1680
  "learning_rate": 6.339194928502517e-05,
1681
  "loss": 1.9865,
1682
  "step": 239
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1683
  }
1684
  ],
1685
  "logging_steps": 1,
@@ -1694,12 +3073,12 @@
1694
  "should_evaluate": false,
1695
  "should_log": false,
1696
  "should_save": true,
1697
- "should_training_stop": false
1698
  },
1699
  "attributes": {}
1700
  }
1701
  },
1702
- "total_flos": 1.1328992742211584e+16,
1703
  "train_batch_size": 4,
1704
  "trial_name": null,
1705
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.0,
5
  "eval_steps": 500,
6
+ "global_step": 436,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
1680
  "learning_rate": 6.339194928502517e-05,
1681
  "loss": 1.9865,
1682
  "step": 239
1683
+ },
1684
+ {
1685
+ "epoch": 0.5504587155963303,
1686
+ "grad_norm": 8.680039405822754,
1687
+ "learning_rate": 6.294095225512603e-05,
1688
+ "loss": 0.8176,
1689
+ "step": 240
1690
+ },
1691
+ {
1692
+ "epoch": 0.5527522935779816,
1693
+ "grad_norm": 21.01581382751465,
1694
+ "learning_rate": 6.248882390836135e-05,
1695
+ "loss": 0.7879,
1696
+ "step": 241
1697
+ },
1698
+ {
1699
+ "epoch": 0.555045871559633,
1700
+ "grad_norm": 10.070744514465332,
1701
+ "learning_rate": 6.203560377044866e-05,
1702
+ "loss": 0.8343,
1703
+ "step": 242
1704
+ },
1705
+ {
1706
+ "epoch": 0.5573394495412844,
1707
+ "grad_norm": 3.3140993118286133,
1708
+ "learning_rate": 6.158133146255153e-05,
1709
+ "loss": 0.4982,
1710
+ "step": 243
1711
+ },
1712
+ {
1713
+ "epoch": 0.5596330275229358,
1714
+ "grad_norm": 5.305646896362305,
1715
+ "learning_rate": 6.112604669781572e-05,
1716
+ "loss": 0.6298,
1717
+ "step": 244
1718
+ },
1719
+ {
1720
+ "epoch": 0.5619266055045872,
1721
+ "grad_norm": 3.5618624687194824,
1722
+ "learning_rate": 6.0669789277897507e-05,
1723
+ "loss": 0.4889,
1724
+ "step": 245
1725
+ },
1726
+ {
1727
+ "epoch": 0.5642201834862385,
1728
+ "grad_norm": 4.08172082901001,
1729
+ "learning_rate": 6.021259908948402e-05,
1730
+ "loss": 0.5831,
1731
+ "step": 246
1732
+ },
1733
+ {
1734
+ "epoch": 0.5665137614678899,
1735
+ "grad_norm": 3.6568925380706787,
1736
+ "learning_rate": 5.9754516100806423e-05,
1737
+ "loss": 0.6037,
1738
+ "step": 247
1739
+ },
1740
+ {
1741
+ "epoch": 0.5688073394495413,
1742
+ "grad_norm": 3.1750009059906006,
1743
+ "learning_rate": 5.9295580358145744e-05,
1744
+ "loss": 0.6221,
1745
+ "step": 248
1746
+ },
1747
+ {
1748
+ "epoch": 0.5711009174311926,
1749
+ "grad_norm": 3.9468281269073486,
1750
+ "learning_rate": 5.8835831982332015e-05,
1751
+ "loss": 0.6358,
1752
+ "step": 249
1753
+ },
1754
+ {
1755
+ "epoch": 0.573394495412844,
1756
+ "grad_norm": 5.328709602355957,
1757
+ "learning_rate": 5.837531116523682e-05,
1758
+ "loss": 0.7811,
1759
+ "step": 250
1760
+ },
1761
+ {
1762
+ "epoch": 0.5756880733944955,
1763
+ "grad_norm": 1.4755908250808716,
1764
+ "learning_rate": 5.791405816625975e-05,
1765
+ "loss": 0.5937,
1766
+ "step": 251
1767
+ },
1768
+ {
1769
+ "epoch": 0.5779816513761468,
1770
+ "grad_norm": 1.5495307445526123,
1771
+ "learning_rate": 5.745211330880872e-05,
1772
+ "loss": 0.589,
1773
+ "step": 252
1774
+ },
1775
+ {
1776
+ "epoch": 0.5802752293577982,
1777
+ "grad_norm": 1.6908055543899536,
1778
+ "learning_rate": 5.698951697677498e-05,
1779
+ "loss": 0.6837,
1780
+ "step": 253
1781
+ },
1782
+ {
1783
+ "epoch": 0.5825688073394495,
1784
+ "grad_norm": 1.8224513530731201,
1785
+ "learning_rate": 5.6526309611002594e-05,
1786
+ "loss": 0.9065,
1787
+ "step": 254
1788
+ },
1789
+ {
1790
+ "epoch": 0.5848623853211009,
1791
+ "grad_norm": 1.8069437742233276,
1792
+ "learning_rate": 5.6062531705753075e-05,
1793
+ "loss": 0.7501,
1794
+ "step": 255
1795
+ },
1796
+ {
1797
+ "epoch": 0.5871559633027523,
1798
+ "grad_norm": 2.2407689094543457,
1799
+ "learning_rate": 5.559822380516539e-05,
1800
+ "loss": 0.9369,
1801
+ "step": 256
1802
+ },
1803
+ {
1804
+ "epoch": 0.5894495412844036,
1805
+ "grad_norm": 2.168276071548462,
1806
+ "learning_rate": 5.5133426499711425e-05,
1807
+ "loss": 1.0019,
1808
+ "step": 257
1809
+ },
1810
+ {
1811
+ "epoch": 0.591743119266055,
1812
+ "grad_norm": 3.047189474105835,
1813
+ "learning_rate": 5.466818042264753e-05,
1814
+ "loss": 1.3919,
1815
+ "step": 258
1816
+ },
1817
+ {
1818
+ "epoch": 0.5940366972477065,
1819
+ "grad_norm": 2.7798616886138916,
1820
+ "learning_rate": 5.420252624646238e-05,
1821
+ "loss": 1.1665,
1822
+ "step": 259
1823
+ },
1824
+ {
1825
+ "epoch": 0.5963302752293578,
1826
+ "grad_norm": 3.3886473178863525,
1827
+ "learning_rate": 5.373650467932122e-05,
1828
+ "loss": 1.6027,
1829
+ "step": 260
1830
+ },
1831
+ {
1832
+ "epoch": 0.5986238532110092,
1833
+ "grad_norm": 3.44380784034729,
1834
+ "learning_rate": 5.327015646150716e-05,
1835
+ "loss": 1.3152,
1836
+ "step": 261
1837
+ },
1838
+ {
1839
+ "epoch": 0.6009174311926605,
1840
+ "grad_norm": 3.924863338470459,
1841
+ "learning_rate": 5.2803522361859594e-05,
1842
+ "loss": 1.2854,
1843
+ "step": 262
1844
+ },
1845
+ {
1846
+ "epoch": 0.6032110091743119,
1847
+ "grad_norm": 3.6491658687591553,
1848
+ "learning_rate": 5.233664317421012e-05,
1849
+ "loss": 1.0868,
1850
+ "step": 263
1851
+ },
1852
+ {
1853
+ "epoch": 0.6055045871559633,
1854
+ "grad_norm": 4.161890983581543,
1855
+ "learning_rate": 5.18695597138163e-05,
1856
+ "loss": 1.4397,
1857
+ "step": 264
1858
+ },
1859
+ {
1860
+ "epoch": 0.6077981651376146,
1861
+ "grad_norm": 5.066677570343018,
1862
+ "learning_rate": 5.140231281379345e-05,
1863
+ "loss": 1.5409,
1864
+ "step": 265
1865
+ },
1866
+ {
1867
+ "epoch": 0.6100917431192661,
1868
+ "grad_norm": 5.091641426086426,
1869
+ "learning_rate": 5.0934943321545115e-05,
1870
+ "loss": 1.6991,
1871
+ "step": 266
1872
+ },
1873
+ {
1874
+ "epoch": 0.6123853211009175,
1875
+ "grad_norm": 6.486395835876465,
1876
+ "learning_rate": 5.046749209519197e-05,
1877
+ "loss": 1.7857,
1878
+ "step": 267
1879
+ },
1880
+ {
1881
+ "epoch": 0.6146788990825688,
1882
+ "grad_norm": 9.023844718933105,
1883
+ "learning_rate": 5e-05,
1884
+ "loss": 2.5128,
1885
+ "step": 268
1886
+ },
1887
+ {
1888
+ "epoch": 0.6169724770642202,
1889
+ "grad_norm": 6.766034126281738,
1890
+ "learning_rate": 4.953250790480805e-05,
1891
+ "loss": 1.9218,
1892
+ "step": 269
1893
+ },
1894
+ {
1895
+ "epoch": 0.6192660550458715,
1896
+ "grad_norm": 8.038894653320312,
1897
+ "learning_rate": 4.9065056678454904e-05,
1898
+ "loss": 2.2512,
1899
+ "step": 270
1900
+ },
1901
+ {
1902
+ "epoch": 0.6215596330275229,
1903
+ "grad_norm": 7.6417365074157715,
1904
+ "learning_rate": 4.859768718620656e-05,
1905
+ "loss": 2.4227,
1906
+ "step": 271
1907
+ },
1908
+ {
1909
+ "epoch": 0.6238532110091743,
1910
+ "grad_norm": 7.12877082824707,
1911
+ "learning_rate": 4.813044028618373e-05,
1912
+ "loss": 2.0528,
1913
+ "step": 272
1914
+ },
1915
+ {
1916
+ "epoch": 0.6261467889908257,
1917
+ "grad_norm": 7.904975891113281,
1918
+ "learning_rate": 4.76633568257899e-05,
1919
+ "loss": 2.2547,
1920
+ "step": 273
1921
+ },
1922
+ {
1923
+ "epoch": 0.6284403669724771,
1924
+ "grad_norm": 6.577536582946777,
1925
+ "learning_rate": 4.7196477638140404e-05,
1926
+ "loss": 1.984,
1927
+ "step": 274
1928
+ },
1929
+ {
1930
+ "epoch": 0.6307339449541285,
1931
+ "grad_norm": 8.355478286743164,
1932
+ "learning_rate": 4.6729843538492847e-05,
1933
+ "loss": 2.227,
1934
+ "step": 275
1935
+ },
1936
+ {
1937
+ "epoch": 0.6330275229357798,
1938
+ "grad_norm": 8.487154006958008,
1939
+ "learning_rate": 4.626349532067879e-05,
1940
+ "loss": 2.1905,
1941
+ "step": 276
1942
+ },
1943
+ {
1944
+ "epoch": 0.6353211009174312,
1945
+ "grad_norm": 8.718094825744629,
1946
+ "learning_rate": 4.579747375353763e-05,
1947
+ "loss": 2.0898,
1948
+ "step": 277
1949
+ },
1950
+ {
1951
+ "epoch": 0.6376146788990825,
1952
+ "grad_norm": 9.59514045715332,
1953
+ "learning_rate": 4.5331819577352474e-05,
1954
+ "loss": 2.1952,
1955
+ "step": 278
1956
+ },
1957
+ {
1958
+ "epoch": 0.6399082568807339,
1959
+ "grad_norm": 6.1734538078308105,
1960
+ "learning_rate": 4.486657350028859e-05,
1961
+ "loss": 1.4662,
1962
+ "step": 279
1963
+ },
1964
+ {
1965
+ "epoch": 0.6422018348623854,
1966
+ "grad_norm": 8.729260444641113,
1967
+ "learning_rate": 4.4401776194834613e-05,
1968
+ "loss": 2.3723,
1969
+ "step": 280
1970
+ },
1971
+ {
1972
+ "epoch": 0.6444954128440367,
1973
+ "grad_norm": 9.038458824157715,
1974
+ "learning_rate": 4.393746829424693e-05,
1975
+ "loss": 1.5177,
1976
+ "step": 281
1977
+ },
1978
+ {
1979
+ "epoch": 0.6467889908256881,
1980
+ "grad_norm": 10.071760177612305,
1981
+ "learning_rate": 4.347369038899744e-05,
1982
+ "loss": 1.8683,
1983
+ "step": 282
1984
+ },
1985
+ {
1986
+ "epoch": 0.6490825688073395,
1987
+ "grad_norm": 9.053478240966797,
1988
+ "learning_rate": 4.3010483023225045e-05,
1989
+ "loss": 1.7053,
1990
+ "step": 283
1991
+ },
1992
+ {
1993
+ "epoch": 0.6513761467889908,
1994
+ "grad_norm": 8.493107795715332,
1995
+ "learning_rate": 4.254788669119127e-05,
1996
+ "loss": 0.8987,
1997
+ "step": 284
1998
+ },
1999
+ {
2000
+ "epoch": 0.6536697247706422,
2001
+ "grad_norm": 9.247946739196777,
2002
+ "learning_rate": 4.208594183374026e-05,
2003
+ "loss": 0.7432,
2004
+ "step": 285
2005
+ },
2006
+ {
2007
+ "epoch": 0.6559633027522935,
2008
+ "grad_norm": 7.0636115074157715,
2009
+ "learning_rate": 4.162468883476319e-05,
2010
+ "loss": 0.6019,
2011
+ "step": 286
2012
+ },
2013
+ {
2014
+ "epoch": 0.658256880733945,
2015
+ "grad_norm": 11.127235412597656,
2016
+ "learning_rate": 4.1164168017668e-05,
2017
+ "loss": 2.198,
2018
+ "step": 287
2019
+ },
2020
+ {
2021
+ "epoch": 0.6605504587155964,
2022
+ "grad_norm": 13.40218448638916,
2023
+ "learning_rate": 4.0704419641854274e-05,
2024
+ "loss": 2.4764,
2025
+ "step": 288
2026
+ },
2027
+ {
2028
+ "epoch": 0.6628440366972477,
2029
+ "grad_norm": 6.9688029289245605,
2030
+ "learning_rate": 4.0245483899193595e-05,
2031
+ "loss": 1.2261,
2032
+ "step": 289
2033
+ },
2034
+ {
2035
+ "epoch": 0.6651376146788991,
2036
+ "grad_norm": 20.206615447998047,
2037
+ "learning_rate": 3.978740091051599e-05,
2038
+ "loss": 2.6922,
2039
+ "step": 290
2040
+ },
2041
+ {
2042
+ "epoch": 0.6674311926605505,
2043
+ "grad_norm": 9.008017539978027,
2044
+ "learning_rate": 3.933021072210251e-05,
2045
+ "loss": 1.2186,
2046
+ "step": 291
2047
+ },
2048
+ {
2049
+ "epoch": 0.6697247706422018,
2050
+ "grad_norm": 53.592010498046875,
2051
+ "learning_rate": 3.887395330218429e-05,
2052
+ "loss": 0.7876,
2053
+ "step": 292
2054
+ },
2055
+ {
2056
+ "epoch": 0.6720183486238532,
2057
+ "grad_norm": 5.339064121246338,
2058
+ "learning_rate": 3.8418668537448495e-05,
2059
+ "loss": 0.6078,
2060
+ "step": 293
2061
+ },
2062
+ {
2063
+ "epoch": 0.6743119266055045,
2064
+ "grad_norm": 3.749335765838623,
2065
+ "learning_rate": 3.7964396229551364e-05,
2066
+ "loss": 0.4519,
2067
+ "step": 294
2068
+ },
2069
+ {
2070
+ "epoch": 0.676605504587156,
2071
+ "grad_norm": 4.613084316253662,
2072
+ "learning_rate": 3.7511176091638653e-05,
2073
+ "loss": 0.5232,
2074
+ "step": 295
2075
+ },
2076
+ {
2077
+ "epoch": 0.6788990825688074,
2078
+ "grad_norm": 4.828958511352539,
2079
+ "learning_rate": 3.705904774487396e-05,
2080
+ "loss": 0.4491,
2081
+ "step": 296
2082
+ },
2083
+ {
2084
+ "epoch": 0.6811926605504587,
2085
+ "grad_norm": 3.427786111831665,
2086
+ "learning_rate": 3.660805071497485e-05,
2087
+ "loss": 0.4936,
2088
+ "step": 297
2089
+ },
2090
+ {
2091
+ "epoch": 0.6834862385321101,
2092
+ "grad_norm": 3.4336676597595215,
2093
+ "learning_rate": 3.6158224428757535e-05,
2094
+ "loss": 0.5795,
2095
+ "step": 298
2096
+ },
2097
+ {
2098
+ "epoch": 0.6857798165137615,
2099
+ "grad_norm": 3.517413377761841,
2100
+ "learning_rate": 3.5709608210690125e-05,
2101
+ "loss": 0.5185,
2102
+ "step": 299
2103
+ },
2104
+ {
2105
+ "epoch": 0.6880733944954128,
2106
+ "grad_norm": 4.021430969238281,
2107
+ "learning_rate": 3.5262241279454785e-05,
2108
+ "loss": 0.7537,
2109
+ "step": 300
2110
+ },
2111
+ {
2112
+ "epoch": 0.6903669724770642,
2113
+ "grad_norm": 1.3459668159484863,
2114
+ "learning_rate": 3.4816162744519263e-05,
2115
+ "loss": 0.7576,
2116
+ "step": 301
2117
+ },
2118
+ {
2119
+ "epoch": 0.6926605504587156,
2120
+ "grad_norm": 1.2463675737380981,
2121
+ "learning_rate": 3.4371411602717784e-05,
2122
+ "loss": 0.4637,
2123
+ "step": 302
2124
+ },
2125
+ {
2126
+ "epoch": 0.694954128440367,
2127
+ "grad_norm": 1.3952363729476929,
2128
+ "learning_rate": 3.392802673484193e-05,
2129
+ "loss": 0.6165,
2130
+ "step": 303
2131
+ },
2132
+ {
2133
+ "epoch": 0.6972477064220184,
2134
+ "grad_norm": 1.5832312107086182,
2135
+ "learning_rate": 3.3486046902241664e-05,
2136
+ "loss": 0.7459,
2137
+ "step": 304
2138
+ },
2139
+ {
2140
+ "epoch": 0.6995412844036697,
2141
+ "grad_norm": 1.7343112230300903,
2142
+ "learning_rate": 3.3045510743436665e-05,
2143
+ "loss": 0.785,
2144
+ "step": 305
2145
+ },
2146
+ {
2147
+ "epoch": 0.7018348623853211,
2148
+ "grad_norm": 2.200110673904419,
2149
+ "learning_rate": 3.2606456770738636e-05,
2150
+ "loss": 1.1081,
2151
+ "step": 306
2152
+ },
2153
+ {
2154
+ "epoch": 0.7041284403669725,
2155
+ "grad_norm": 1.953981637954712,
2156
+ "learning_rate": 3.216892336688435e-05,
2157
+ "loss": 0.8974,
2158
+ "step": 307
2159
+ },
2160
+ {
2161
+ "epoch": 0.7064220183486238,
2162
+ "grad_norm": 2.165806293487549,
2163
+ "learning_rate": 3.173294878168025e-05,
2164
+ "loss": 1.0284,
2165
+ "step": 308
2166
+ },
2167
+ {
2168
+ "epoch": 0.7087155963302753,
2169
+ "grad_norm": 1.6866214275360107,
2170
+ "learning_rate": 3.129857112865859e-05,
2171
+ "loss": 0.6057,
2172
+ "step": 309
2173
+ },
2174
+ {
2175
+ "epoch": 0.7110091743119266,
2176
+ "grad_norm": 2.105428457260132,
2177
+ "learning_rate": 3.086582838174551e-05,
2178
+ "loss": 0.7096,
2179
+ "step": 310
2180
+ },
2181
+ {
2182
+ "epoch": 0.713302752293578,
2183
+ "grad_norm": 3.2524518966674805,
2184
+ "learning_rate": 3.0434758371941258e-05,
2185
+ "loss": 1.5531,
2186
+ "step": 311
2187
+ },
2188
+ {
2189
+ "epoch": 0.7155963302752294,
2190
+ "grad_norm": 2.998818874359131,
2191
+ "learning_rate": 3.000539878401296e-05,
2192
+ "loss": 1.1911,
2193
+ "step": 312
2194
+ },
2195
+ {
2196
+ "epoch": 0.7178899082568807,
2197
+ "grad_norm": 3.267188549041748,
2198
+ "learning_rate": 2.9577787153200197e-05,
2199
+ "loss": 1.2269,
2200
+ "step": 313
2201
+ },
2202
+ {
2203
+ "epoch": 0.7201834862385321,
2204
+ "grad_norm": 3.965543270111084,
2205
+ "learning_rate": 2.9151960861933614e-05,
2206
+ "loss": 1.5541,
2207
+ "step": 314
2208
+ },
2209
+ {
2210
+ "epoch": 0.7224770642201835,
2211
+ "grad_norm": 4.665194511413574,
2212
+ "learning_rate": 2.8727957136566823e-05,
2213
+ "loss": 2.0018,
2214
+ "step": 315
2215
+ },
2216
+ {
2217
+ "epoch": 0.7247706422018348,
2218
+ "grad_norm": 4.473689079284668,
2219
+ "learning_rate": 2.8305813044122097e-05,
2220
+ "loss": 1.6075,
2221
+ "step": 316
2222
+ },
2223
+ {
2224
+ "epoch": 0.7270642201834863,
2225
+ "grad_norm": 5.812178611755371,
2226
+ "learning_rate": 2.7885565489049946e-05,
2227
+ "loss": 2.3831,
2228
+ "step": 317
2229
+ },
2230
+ {
2231
+ "epoch": 0.7293577981651376,
2232
+ "grad_norm": 6.922824382781982,
2233
+ "learning_rate": 2.746725121000273e-05,
2234
+ "loss": 2.4602,
2235
+ "step": 318
2236
+ },
2237
+ {
2238
+ "epoch": 0.731651376146789,
2239
+ "grad_norm": 7.8420329093933105,
2240
+ "learning_rate": 2.705090677662311e-05,
2241
+ "loss": 2.3614,
2242
+ "step": 319
2243
+ },
2244
+ {
2245
+ "epoch": 0.7339449541284404,
2246
+ "grad_norm": 7.710655689239502,
2247
+ "learning_rate": 2.66365685863469e-05,
2248
+ "loss": 2.6817,
2249
+ "step": 320
2250
+ },
2251
+ {
2252
+ "epoch": 0.7362385321100917,
2253
+ "grad_norm": 6.691256999969482,
2254
+ "learning_rate": 2.6224272861221243e-05,
2255
+ "loss": 1.5446,
2256
+ "step": 321
2257
+ },
2258
+ {
2259
+ "epoch": 0.7385321100917431,
2260
+ "grad_norm": 8.001251220703125,
2261
+ "learning_rate": 2.581405564473801e-05,
2262
+ "loss": 2.3128,
2263
+ "step": 322
2264
+ },
2265
+ {
2266
+ "epoch": 0.7408256880733946,
2267
+ "grad_norm": 7.6729583740234375,
2268
+ "learning_rate": 2.5405952798682842e-05,
2269
+ "loss": 2.1427,
2270
+ "step": 323
2271
+ },
2272
+ {
2273
+ "epoch": 0.7431192660550459,
2274
+ "grad_norm": 7.609097957611084,
2275
+ "learning_rate": 2.500000000000001e-05,
2276
+ "loss": 1.8877,
2277
+ "step": 324
2278
+ },
2279
+ {
2280
+ "epoch": 0.7454128440366973,
2281
+ "grad_norm": 7.955555438995361,
2282
+ "learning_rate": 2.459623273767354e-05,
2283
+ "loss": 2.0801,
2284
+ "step": 325
2285
+ },
2286
+ {
2287
+ "epoch": 0.7477064220183486,
2288
+ "grad_norm": 8.366448402404785,
2289
+ "learning_rate": 2.4194686309624663e-05,
2290
+ "loss": 2.079,
2291
+ "step": 326
2292
+ },
2293
+ {
2294
+ "epoch": 0.75,
2295
+ "grad_norm": 7.975131511688232,
2296
+ "learning_rate": 2.3795395819626114e-05,
2297
+ "loss": 1.9748,
2298
+ "step": 327
2299
+ },
2300
+ {
2301
+ "epoch": 0.7522935779816514,
2302
+ "grad_norm": 8.098983764648438,
2303
+ "learning_rate": 2.3398396174233178e-05,
2304
+ "loss": 2.0463,
2305
+ "step": 328
2306
+ },
2307
+ {
2308
+ "epoch": 0.7545871559633027,
2309
+ "grad_norm": 8.904077529907227,
2310
+ "learning_rate": 2.300372207973219e-05,
2311
+ "loss": 2.1938,
2312
+ "step": 329
2313
+ },
2314
+ {
2315
+ "epoch": 0.7568807339449541,
2316
+ "grad_norm": 11.033387184143066,
2317
+ "learning_rate": 2.261140803910644e-05,
2318
+ "loss": 1.4517,
2319
+ "step": 330
2320
+ },
2321
+ {
2322
+ "epoch": 0.7591743119266054,
2323
+ "grad_norm": 9.668691635131836,
2324
+ "learning_rate": 2.2221488349019903e-05,
2325
+ "loss": 1.9265,
2326
+ "step": 331
2327
+ },
2328
+ {
2329
+ "epoch": 0.7614678899082569,
2330
+ "grad_norm": 6.776542663574219,
2331
+ "learning_rate": 2.1833997096818898e-05,
2332
+ "loss": 1.1323,
2333
+ "step": 332
2334
+ },
2335
+ {
2336
+ "epoch": 0.7637614678899083,
2337
+ "grad_norm": 7.969955921173096,
2338
+ "learning_rate": 2.144896815755224e-05,
2339
+ "loss": 1.5859,
2340
+ "step": 333
2341
+ },
2342
+ {
2343
+ "epoch": 0.7660550458715596,
2344
+ "grad_norm": 8.130461692810059,
2345
+ "learning_rate": 2.1066435191009715e-05,
2346
+ "loss": 1.4637,
2347
+ "step": 334
2348
+ },
2349
+ {
2350
+ "epoch": 0.768348623853211,
2351
+ "grad_norm": 8.98421573638916,
2352
+ "learning_rate": 2.0686431638779564e-05,
2353
+ "loss": 1.7904,
2354
+ "step": 335
2355
+ },
2356
+ {
2357
+ "epoch": 0.7706422018348624,
2358
+ "grad_norm": 11.332923889160156,
2359
+ "learning_rate": 2.0308990721324927e-05,
2360
+ "loss": 2.4356,
2361
+ "step": 336
2362
+ },
2363
+ {
2364
+ "epoch": 0.7729357798165137,
2365
+ "grad_norm": 12.836410522460938,
2366
+ "learning_rate": 1.9934145435079702e-05,
2367
+ "loss": 2.4158,
2368
+ "step": 337
2369
+ },
2370
+ {
2371
+ "epoch": 0.7752293577981652,
2372
+ "grad_norm": 8.298335075378418,
2373
+ "learning_rate": 1.9561928549563968e-05,
2374
+ "loss": 1.2192,
2375
+ "step": 338
2376
+ },
2377
+ {
2378
+ "epoch": 0.7775229357798165,
2379
+ "grad_norm": 15.296181678771973,
2380
+ "learning_rate": 1.9192372604519127e-05,
2381
+ "loss": 1.7368,
2382
+ "step": 339
2383
+ },
2384
+ {
2385
+ "epoch": 0.7798165137614679,
2386
+ "grad_norm": 29.77173614501953,
2387
+ "learning_rate": 1.8825509907063327e-05,
2388
+ "loss": 2.0105,
2389
+ "step": 340
2390
+ },
2391
+ {
2392
+ "epoch": 0.7821100917431193,
2393
+ "grad_norm": 52.93706512451172,
2394
+ "learning_rate": 1.8461372528867093e-05,
2395
+ "loss": 0.9187,
2396
+ "step": 341
2397
+ },
2398
+ {
2399
+ "epoch": 0.7844036697247706,
2400
+ "grad_norm": 9.151922225952148,
2401
+ "learning_rate": 1.8099992303349577e-05,
2402
+ "loss": 0.4775,
2403
+ "step": 342
2404
+ },
2405
+ {
2406
+ "epoch": 0.786697247706422,
2407
+ "grad_norm": 4.057742595672607,
2408
+ "learning_rate": 1.774140082289563e-05,
2409
+ "loss": 0.4805,
2410
+ "step": 343
2411
+ },
2412
+ {
2413
+ "epoch": 0.7889908256880734,
2414
+ "grad_norm": 5.886656284332275,
2415
+ "learning_rate": 1.738562943609396e-05,
2416
+ "loss": 0.5288,
2417
+ "step": 344
2418
+ },
2419
+ {
2420
+ "epoch": 0.7912844036697247,
2421
+ "grad_norm": 3.6186325550079346,
2422
+ "learning_rate": 1.703270924499656e-05,
2423
+ "loss": 0.471,
2424
+ "step": 345
2425
+ },
2426
+ {
2427
+ "epoch": 0.7935779816513762,
2428
+ "grad_norm": 3.7122743129730225,
2429
+ "learning_rate": 1.6682671102399805e-05,
2430
+ "loss": 0.4429,
2431
+ "step": 346
2432
+ },
2433
+ {
2434
+ "epoch": 0.7958715596330275,
2435
+ "grad_norm": 3.1317362785339355,
2436
+ "learning_rate": 1.6335545609147142e-05,
2437
+ "loss": 0.5182,
2438
+ "step": 347
2439
+ },
2440
+ {
2441
+ "epoch": 0.7981651376146789,
2442
+ "grad_norm": 3.575540065765381,
2443
+ "learning_rate": 1.599136311145402e-05,
2444
+ "loss": 0.6401,
2445
+ "step": 348
2446
+ },
2447
+ {
2448
+ "epoch": 0.8004587155963303,
2449
+ "grad_norm": 3.384408712387085,
2450
+ "learning_rate": 1.5650153698254916e-05,
2451
+ "loss": 0.5007,
2452
+ "step": 349
2453
+ },
2454
+ {
2455
+ "epoch": 0.8027522935779816,
2456
+ "grad_norm": 3.7142014503479004,
2457
+ "learning_rate": 1.531194719857292e-05,
2458
+ "loss": 0.5944,
2459
+ "step": 350
2460
+ },
2461
+ {
2462
+ "epoch": 0.805045871559633,
2463
+ "grad_norm": 1.2152124643325806,
2464
+ "learning_rate": 1.4976773178912084e-05,
2465
+ "loss": 0.5656,
2466
+ "step": 351
2467
+ },
2468
+ {
2469
+ "epoch": 0.8073394495412844,
2470
+ "grad_norm": 1.2784579992294312,
2471
+ "learning_rate": 1.4644660940672627e-05,
2472
+ "loss": 0.5921,
2473
+ "step": 352
2474
+ },
2475
+ {
2476
+ "epoch": 0.8096330275229358,
2477
+ "grad_norm": 1.2699741125106812,
2478
+ "learning_rate": 1.4315639517589397e-05,
2479
+ "loss": 0.485,
2480
+ "step": 353
2481
+ },
2482
+ {
2483
+ "epoch": 0.8119266055045872,
2484
+ "grad_norm": 1.3509347438812256,
2485
+ "learning_rate": 1.398973767319368e-05,
2486
+ "loss": 0.5775,
2487
+ "step": 354
2488
+ },
2489
+ {
2490
+ "epoch": 0.8142201834862385,
2491
+ "grad_norm": 2.171257495880127,
2492
+ "learning_rate": 1.3666983898298657e-05,
2493
+ "loss": 1.2049,
2494
+ "step": 355
2495
+ },
2496
+ {
2497
+ "epoch": 0.8165137614678899,
2498
+ "grad_norm": 1.8562023639678955,
2499
+ "learning_rate": 1.3347406408508695e-05,
2500
+ "loss": 0.8346,
2501
+ "step": 356
2502
+ },
2503
+ {
2504
+ "epoch": 0.8188073394495413,
2505
+ "grad_norm": 1.9856826066970825,
2506
+ "learning_rate": 1.3031033141752702e-05,
2507
+ "loss": 0.8304,
2508
+ "step": 357
2509
+ },
2510
+ {
2511
+ "epoch": 0.8211009174311926,
2512
+ "grad_norm": 2.337763547897339,
2513
+ "learning_rate": 1.2717891755841722e-05,
2514
+ "loss": 1.2159,
2515
+ "step": 358
2516
+ },
2517
+ {
2518
+ "epoch": 0.823394495412844,
2519
+ "grad_norm": 2.3222556114196777,
2520
+ "learning_rate": 1.2408009626051137e-05,
2521
+ "loss": 1.0698,
2522
+ "step": 359
2523
+ },
2524
+ {
2525
+ "epoch": 0.8256880733944955,
2526
+ "grad_norm": 2.8859426975250244,
2527
+ "learning_rate": 1.2101413842727345e-05,
2528
+ "loss": 1.46,
2529
+ "step": 360
2530
+ },
2531
+ {
2532
+ "epoch": 0.8279816513761468,
2533
+ "grad_norm": 2.7983813285827637,
2534
+ "learning_rate": 1.1798131208919627e-05,
2535
+ "loss": 1.2568,
2536
+ "step": 361
2537
+ },
2538
+ {
2539
+ "epoch": 0.8302752293577982,
2540
+ "grad_norm": 3.1168298721313477,
2541
+ "learning_rate": 1.1498188238036861e-05,
2542
+ "loss": 1.3348,
2543
+ "step": 362
2544
+ },
2545
+ {
2546
+ "epoch": 0.8325688073394495,
2547
+ "grad_norm": 3.040313482284546,
2548
+ "learning_rate": 1.1201611151529756e-05,
2549
+ "loss": 0.9272,
2550
+ "step": 363
2551
+ },
2552
+ {
2553
+ "epoch": 0.8348623853211009,
2554
+ "grad_norm": 3.5628716945648193,
2555
+ "learning_rate": 1.090842587659851e-05,
2556
+ "loss": 1.1717,
2557
+ "step": 364
2558
+ },
2559
+ {
2560
+ "epoch": 0.8371559633027523,
2561
+ "grad_norm": 4.757216453552246,
2562
+ "learning_rate": 1.0618658043926233e-05,
2563
+ "loss": 1.7844,
2564
+ "step": 365
2565
+ },
2566
+ {
2567
+ "epoch": 0.8394495412844036,
2568
+ "grad_norm": 5.807982444763184,
2569
+ "learning_rate": 1.0332332985438248e-05,
2570
+ "loss": 1.8459,
2571
+ "step": 366
2572
+ },
2573
+ {
2574
+ "epoch": 0.841743119266055,
2575
+ "grad_norm": 6.931635856628418,
2576
+ "learning_rate": 1.004947573208756e-05,
2577
+ "loss": 2.4206,
2578
+ "step": 367
2579
+ },
2580
+ {
2581
+ "epoch": 0.8440366972477065,
2582
+ "grad_norm": 6.323002338409424,
2583
+ "learning_rate": 9.770111011666583e-06,
2584
+ "loss": 1.7975,
2585
+ "step": 368
2586
+ },
2587
+ {
2588
+ "epoch": 0.8463302752293578,
2589
+ "grad_norm": 6.554073333740234,
2590
+ "learning_rate": 9.494263246645474e-06,
2591
+ "loss": 2.1683,
2592
+ "step": 369
2593
+ },
2594
+ {
2595
+ "epoch": 0.8486238532110092,
2596
+ "grad_norm": 7.123667240142822,
2597
+ "learning_rate": 9.221956552036992e-06,
2598
+ "loss": 2.4647,
2599
+ "step": 370
2600
+ },
2601
+ {
2602
+ "epoch": 0.8509174311926605,
2603
+ "grad_norm": 7.333261966705322,
2604
+ "learning_rate": 8.953214733288383e-06,
2605
+ "loss": 2.4145,
2606
+ "step": 371
2607
+ },
2608
+ {
2609
+ "epoch": 0.8532110091743119,
2610
+ "grad_norm": 7.81040096282959,
2611
+ "learning_rate": 8.688061284200266e-06,
2612
+ "loss": 2.5398,
2613
+ "step": 372
2614
+ },
2615
+ {
2616
+ "epoch": 0.8555045871559633,
2617
+ "grad_norm": 7.140593528747559,
2618
+ "learning_rate": 8.426519384872733e-06,
2619
+ "loss": 1.8465,
2620
+ "step": 373
2621
+ },
2622
+ {
2623
+ "epoch": 0.8577981651376146,
2624
+ "grad_norm": 8.926170349121094,
2625
+ "learning_rate": 8.168611899679013e-06,
2626
+ "loss": 2.4094,
2627
+ "step": 374
2628
+ },
2629
+ {
2630
+ "epoch": 0.8600917431192661,
2631
+ "grad_norm": 8.442564010620117,
2632
+ "learning_rate": 7.914361375266504e-06,
2633
+ "loss": 2.2949,
2634
+ "step": 375
2635
+ },
2636
+ {
2637
+ "epoch": 0.8623853211009175,
2638
+ "grad_norm": 7.919826030731201,
2639
+ "learning_rate": 7.663790038585793e-06,
2640
+ "loss": 2.1068,
2641
+ "step": 376
2642
+ },
2643
+ {
2644
+ "epoch": 0.8646788990825688,
2645
+ "grad_norm": 8.824353218078613,
2646
+ "learning_rate": 7.416919794947536e-06,
2647
+ "loss": 2.7126,
2648
+ "step": 377
2649
+ },
2650
+ {
2651
+ "epoch": 0.8669724770642202,
2652
+ "grad_norm": 9.781707763671875,
2653
+ "learning_rate": 7.173772226107434e-06,
2654
+ "loss": 2.4447,
2655
+ "step": 378
2656
+ },
2657
+ {
2658
+ "epoch": 0.8692660550458715,
2659
+ "grad_norm": 6.505077838897705,
2660
+ "learning_rate": 6.934368588379553e-06,
2661
+ "loss": 1.1717,
2662
+ "step": 379
2663
+ },
2664
+ {
2665
+ "epoch": 0.8715596330275229,
2666
+ "grad_norm": 9.621846199035645,
2667
+ "learning_rate": 6.698729810778065e-06,
2668
+ "loss": 1.8266,
2669
+ "step": 380
2670
+ },
2671
+ {
2672
+ "epoch": 0.8738532110091743,
2673
+ "grad_norm": 9.959877014160156,
2674
+ "learning_rate": 6.46687649318759e-06,
2675
+ "loss": 1.2395,
2676
+ "step": 381
2677
+ },
2678
+ {
2679
+ "epoch": 0.8761467889908257,
2680
+ "grad_norm": 5.53802490234375,
2681
+ "learning_rate": 6.238828904562316e-06,
2682
+ "loss": 0.4439,
2683
+ "step": 382
2684
+ },
2685
+ {
2686
+ "epoch": 0.8784403669724771,
2687
+ "grad_norm": 4.555595397949219,
2688
+ "learning_rate": 6.014606981154086e-06,
2689
+ "loss": 0.3088,
2690
+ "step": 383
2691
+ },
2692
+ {
2693
+ "epoch": 0.8807339449541285,
2694
+ "grad_norm": 9.899633407592773,
2695
+ "learning_rate": 5.794230324769517e-06,
2696
+ "loss": 1.7023,
2697
+ "step": 384
2698
+ },
2699
+ {
2700
+ "epoch": 0.8830275229357798,
2701
+ "grad_norm": 11.428282737731934,
2702
+ "learning_rate": 5.577718201056392e-06,
2703
+ "loss": 2.7028,
2704
+ "step": 385
2705
+ },
2706
+ {
2707
+ "epoch": 0.8853211009174312,
2708
+ "grad_norm": 11.778576850891113,
2709
+ "learning_rate": 5.365089537819434e-06,
2710
+ "loss": 2.2125,
2711
+ "step": 386
2712
+ },
2713
+ {
2714
+ "epoch": 0.8876146788990825,
2715
+ "grad_norm": 13.558716773986816,
2716
+ "learning_rate": 5.156362923365588e-06,
2717
+ "loss": 2.5294,
2718
+ "step": 387
2719
+ },
2720
+ {
2721
+ "epoch": 0.8899082568807339,
2722
+ "grad_norm": 14.217622756958008,
2723
+ "learning_rate": 4.951556604879048e-06,
2724
+ "loss": 2.5287,
2725
+ "step": 388
2726
+ },
2727
+ {
2728
+ "epoch": 0.8922018348623854,
2729
+ "grad_norm": 10.065847396850586,
2730
+ "learning_rate": 4.7506884868259995e-06,
2731
+ "loss": 1.123,
2732
+ "step": 389
2733
+ },
2734
+ {
2735
+ "epoch": 0.8944954128440367,
2736
+ "grad_norm": 45.029380798339844,
2737
+ "learning_rate": 4.5537761293894535e-06,
2738
+ "loss": 0.8917,
2739
+ "step": 390
2740
+ },
2741
+ {
2742
+ "epoch": 0.8967889908256881,
2743
+ "grad_norm": 14.9079008102417,
2744
+ "learning_rate": 4.360836746934055e-06,
2745
+ "loss": 0.6098,
2746
+ "step": 391
2747
+ },
2748
+ {
2749
+ "epoch": 0.8990825688073395,
2750
+ "grad_norm": 3.1857004165649414,
2751
+ "learning_rate": 4.1718872065011904e-06,
2752
+ "loss": 0.4474,
2753
+ "step": 392
2754
+ },
2755
+ {
2756
+ "epoch": 0.9013761467889908,
2757
+ "grad_norm": 3.916781425476074,
2758
+ "learning_rate": 3.9869440263344714e-06,
2759
+ "loss": 0.6104,
2760
+ "step": 393
2761
+ },
2762
+ {
2763
+ "epoch": 0.9036697247706422,
2764
+ "grad_norm": 3.7978272438049316,
2765
+ "learning_rate": 3.8060233744356633e-06,
2766
+ "loss": 0.4462,
2767
+ "step": 394
2768
+ },
2769
+ {
2770
+ "epoch": 0.9059633027522935,
2771
+ "grad_norm": 4.325674057006836,
2772
+ "learning_rate": 3.6291410671512594e-06,
2773
+ "loss": 0.467,
2774
+ "step": 395
2775
+ },
2776
+ {
2777
+ "epoch": 0.908256880733945,
2778
+ "grad_norm": 3.895176887512207,
2779
+ "learning_rate": 3.4563125677897932e-06,
2780
+ "loss": 0.4774,
2781
+ "step": 396
2782
+ },
2783
+ {
2784
+ "epoch": 0.9105504587155964,
2785
+ "grad_norm": 4.118838787078857,
2786
+ "learning_rate": 3.2875529852700147e-06,
2787
+ "loss": 0.6108,
2788
+ "step": 397
2789
+ },
2790
+ {
2791
+ "epoch": 0.9128440366972477,
2792
+ "grad_norm": 3.1684885025024414,
2793
+ "learning_rate": 3.1228770728000455e-06,
2794
+ "loss": 0.5086,
2795
+ "step": 398
2796
+ },
2797
+ {
2798
+ "epoch": 0.9151376146788991,
2799
+ "grad_norm": 4.82421350479126,
2800
+ "learning_rate": 2.9622992265876392e-06,
2801
+ "loss": 0.7931,
2802
+ "step": 399
2803
+ },
2804
+ {
2805
+ "epoch": 0.9174311926605505,
2806
+ "grad_norm": 4.262173652648926,
2807
+ "learning_rate": 2.8058334845816213e-06,
2808
+ "loss": 0.7849,
2809
+ "step": 400
2810
+ },
2811
+ {
2812
+ "epoch": 0.9197247706422018,
2813
+ "grad_norm": 1.1381596326828003,
2814
+ "learning_rate": 2.653493525244721e-06,
2815
+ "loss": 0.6932,
2816
+ "step": 401
2817
+ },
2818
+ {
2819
+ "epoch": 0.9220183486238532,
2820
+ "grad_norm": 1.1581947803497314,
2821
+ "learning_rate": 2.5052926663577e-06,
2822
+ "loss": 0.5253,
2823
+ "step": 402
2824
+ },
2825
+ {
2826
+ "epoch": 0.9243119266055045,
2827
+ "grad_norm": 1.5183253288269043,
2828
+ "learning_rate": 2.361243863855184e-06,
2829
+ "loss": 0.853,
2830
+ "step": 403
2831
+ },
2832
+ {
2833
+ "epoch": 0.926605504587156,
2834
+ "grad_norm": 1.571021556854248,
2835
+ "learning_rate": 2.221359710692961e-06,
2836
+ "loss": 0.6197,
2837
+ "step": 404
2838
+ },
2839
+ {
2840
+ "epoch": 0.9288990825688074,
2841
+ "grad_norm": 1.6061910390853882,
2842
+ "learning_rate": 2.085652435747132e-06,
2843
+ "loss": 0.7312,
2844
+ "step": 405
2845
+ },
2846
+ {
2847
+ "epoch": 0.9311926605504587,
2848
+ "grad_norm": 2.495539665222168,
2849
+ "learning_rate": 1.9541339027450256e-06,
2850
+ "loss": 1.4046,
2851
+ "step": 406
2852
+ },
2853
+ {
2854
+ "epoch": 0.9334862385321101,
2855
+ "grad_norm": 2.363898277282715,
2856
+ "learning_rate": 1.8268156092280496e-06,
2857
+ "loss": 1.1194,
2858
+ "step": 407
2859
+ },
2860
+ {
2861
+ "epoch": 0.9357798165137615,
2862
+ "grad_norm": 2.802872657775879,
2863
+ "learning_rate": 1.70370868554659e-06,
2864
+ "loss": 1.3099,
2865
+ "step": 408
2866
+ },
2867
+ {
2868
+ "epoch": 0.9380733944954128,
2869
+ "grad_norm": 3.056849479675293,
2870
+ "learning_rate": 1.584823893886933e-06,
2871
+ "loss": 1.1909,
2872
+ "step": 409
2873
+ },
2874
+ {
2875
+ "epoch": 0.9403669724770642,
2876
+ "grad_norm": 3.2700366973876953,
2877
+ "learning_rate": 1.4701716273304521e-06,
2878
+ "loss": 1.2238,
2879
+ "step": 410
2880
+ },
2881
+ {
2882
+ "epoch": 0.9426605504587156,
2883
+ "grad_norm": 3.4624550342559814,
2884
+ "learning_rate": 1.3597619089450342e-06,
2885
+ "loss": 1.2718,
2886
+ "step": 411
2887
+ },
2888
+ {
2889
+ "epoch": 0.944954128440367,
2890
+ "grad_norm": 4.333258628845215,
2891
+ "learning_rate": 1.2536043909088191e-06,
2892
+ "loss": 1.7671,
2893
+ "step": 412
2894
+ },
2895
+ {
2896
+ "epoch": 0.9472477064220184,
2897
+ "grad_norm": 4.528647422790527,
2898
+ "learning_rate": 1.1517083536664142e-06,
2899
+ "loss": 1.4721,
2900
+ "step": 413
2901
+ },
2902
+ {
2903
+ "epoch": 0.9495412844036697,
2904
+ "grad_norm": 5.667140483856201,
2905
+ "learning_rate": 1.0540827051175818e-06,
2906
+ "loss": 1.8333,
2907
+ "step": 414
2908
+ },
2909
+ {
2910
+ "epoch": 0.9518348623853211,
2911
+ "grad_norm": 7.816887378692627,
2912
+ "learning_rate": 9.607359798384785e-07,
2913
+ "loss": 2.4681,
2914
+ "step": 415
2915
+ },
2916
+ {
2917
+ "epoch": 0.9541284403669725,
2918
+ "grad_norm": 6.785305976867676,
2919
+ "learning_rate": 8.716763383355864e-07,
2920
+ "loss": 2.1718,
2921
+ "step": 416
2922
+ },
2923
+ {
2924
+ "epoch": 0.9564220183486238,
2925
+ "grad_norm": 5.145172595977783,
2926
+ "learning_rate": 7.869115663322879e-07,
2927
+ "loss": 1.2761,
2928
+ "step": 417
2929
+ },
2930
+ {
2931
+ "epoch": 0.9587155963302753,
2932
+ "grad_norm": 5.58955717086792,
2933
+ "learning_rate": 7.064490740882057e-07,
2934
+ "loss": 1.5643,
2935
+ "step": 418
2936
+ },
2937
+ {
2938
+ "epoch": 0.9610091743119266,
2939
+ "grad_norm": 7.9990458488464355,
2940
+ "learning_rate": 6.302958957514371e-07,
2941
+ "loss": 1.9113,
2942
+ "step": 419
2943
+ },
2944
+ {
2945
+ "epoch": 0.963302752293578,
2946
+ "grad_norm": 8.559749603271484,
2947
+ "learning_rate": 5.584586887435739e-07,
2948
+ "loss": 2.0603,
2949
+ "step": 420
2950
+ },
2951
+ {
2952
+ "epoch": 0.9655963302752294,
2953
+ "grad_norm": 8.403243064880371,
2954
+ "learning_rate": 4.909437331777179e-07,
2955
+ "loss": 2.0897,
2956
+ "step": 421
2957
+ },
2958
+ {
2959
+ "epoch": 0.9678899082568807,
2960
+ "grad_norm": 8.703038215637207,
2961
+ "learning_rate": 4.277569313094809e-07,
2962
+ "loss": 1.4529,
2963
+ "step": 422
2964
+ },
2965
+ {
2966
+ "epoch": 0.9701834862385321,
2967
+ "grad_norm": 10.667278289794922,
2968
+ "learning_rate": 3.689038070209594e-07,
2969
+ "loss": 1.466,
2970
+ "step": 423
2971
+ },
2972
+ {
2973
+ "epoch": 0.9724770642201835,
2974
+ "grad_norm": 8.07142162322998,
2975
+ "learning_rate": 3.143895053378698e-07,
2976
+ "loss": 1.4229,
2977
+ "step": 424
2978
+ },
2979
+ {
2980
+ "epoch": 0.9747706422018348,
2981
+ "grad_norm": 6.67749547958374,
2982
+ "learning_rate": 2.6421879197974784e-07,
2983
+ "loss": 0.8897,
2984
+ "step": 425
2985
+ },
2986
+ {
2987
+ "epoch": 0.9770642201834863,
2988
+ "grad_norm": 10.8878755569458,
2989
+ "learning_rate": 2.1839605294330933e-07,
2990
+ "loss": 2.0611,
2991
+ "step": 426
2992
+ },
2993
+ {
2994
+ "epoch": 0.9793577981651376,
2995
+ "grad_norm": 13.075303077697754,
2996
+ "learning_rate": 1.7692529411904578e-07,
2997
+ "loss": 2.3857,
2998
+ "step": 427
2999
+ },
3000
+ {
3001
+ "epoch": 0.981651376146789,
3002
+ "grad_norm": 16.536373138427734,
3003
+ "learning_rate": 1.3981014094099353e-07,
3004
+ "loss": 2.6068,
3005
+ "step": 428
3006
+ },
3007
+ {
3008
+ "epoch": 0.9839449541284404,
3009
+ "grad_norm": 20.786617279052734,
3010
+ "learning_rate": 1.0705383806982606e-07,
3011
+ "loss": 1.1238,
3012
+ "step": 429
3013
+ },
3014
+ {
3015
+ "epoch": 0.9862385321100917,
3016
+ "grad_norm": 12.918766975402832,
3017
+ "learning_rate": 7.865924910916977e-08,
3018
+ "loss": 0.4258,
3019
+ "step": 430
3020
+ },
3021
+ {
3022
+ "epoch": 0.9885321100917431,
3023
+ "grad_norm": 3.077303647994995,
3024
+ "learning_rate": 5.462885635529324e-08,
3025
+ "loss": 0.5306,
3026
+ "step": 431
3027
+ },
3028
+ {
3029
+ "epoch": 0.9908256880733946,
3030
+ "grad_norm": 3.1448254585266113,
3031
+ "learning_rate": 3.496476058006959e-08,
3032
+ "loss": 0.4716,
3033
+ "step": 432
3034
+ },
3035
+ {
3036
+ "epoch": 0.9931192660550459,
3037
+ "grad_norm": 3.913860321044922,
3038
+ "learning_rate": 1.9668680847356735e-08,
3039
+ "loss": 0.4871,
3040
+ "step": 433
3041
+ },
3042
+ {
3043
+ "epoch": 0.9954128440366973,
3044
+ "grad_norm": 3.7353949546813965,
3045
+ "learning_rate": 8.741954362678772e-09,
3046
+ "loss": 0.5689,
3047
+ "step": 434
3048
+ },
3049
+ {
3050
+ "epoch": 0.9977064220183486,
3051
+ "grad_norm": 3.645770788192749,
3052
+ "learning_rate": 2.185536356363871e-09,
3053
+ "loss": 0.6416,
3054
+ "step": 435
3055
+ },
3056
+ {
3057
+ "epoch": 1.0,
3058
+ "grad_norm": 4.725067615509033,
3059
+ "learning_rate": 0.0,
3060
+ "loss": 0.7461,
3061
+ "step": 436
3062
  }
3063
  ],
3064
  "logging_steps": 1,
 
3073
  "should_evaluate": false,
3074
  "should_log": false,
3075
  "should_save": true,
3076
+ "should_training_stop": true
3077
  },
3078
  "attributes": {}
3079
  }
3080
  },
3081
+ "total_flos": 2.0262676808073216e+16,
3082
  "train_batch_size": 4,
3083
  "trial_name": null,
3084
  "trial_params": null