Training in progress, step 2825

Files changed:
- adapter_model.safetensors (+1 -1)
- train.log (+120 -0)
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:0a54bf4b2daf665f7187e1e4a2e0253f97194901c1f5c3704ab843df6d5fb57f
 size 1140991056
train.log CHANGED
@@ -1845,3 +1845,123 @@ Time to load cpu_adam op: 2.392569065093994 seconds
 
 [Rank 0] Trainer log: {'loss': 0.8147, 'grad_norm': 3.9575250148773193, 'learning_rate': 8.266503002457191e-06}
 {'loss': 0.8147, 'grad_norm': 3.9575250148773193, 'learning_rate': 8.266503002457191e-06, 'epoch': 0.58}
+[Rank 2] Trainer log: {'loss': 0.754, 'grad_norm': 5.0994720458984375, 'learning_rate': 8.259787473755625e-06}[Rank 1] Trainer log: {'loss': 0.754, 'grad_norm': 5.0994720458984375, 'learning_rate': 8.259787473755625e-06}[Rank 3] Trainer log: {'loss': 0.754, 'grad_norm': 5.0994720458984375, 'learning_rate': 8.259787473755625e-06}
+
+
+[Rank 0] Trainer log: {'loss': 0.754, 'grad_norm': 5.0994720458984375, 'learning_rate': 8.259787473755625e-06}
+{'loss': 0.754, 'grad_norm': 5.0994720458984375, 'learning_rate': 8.259787473755625e-06, 'epoch': 0.58}
+[Rank 2] Trainer log: {'loss': 0.7934, 'grad_norm': 2.9868428707122803, 'learning_rate': 8.253072754272176e-06}[Rank 1] Trainer log: {'loss': 0.7934, 'grad_norm': 2.9868428707122803, 'learning_rate': 8.253072754272176e-06}[Rank 3] Trainer log: {'loss': 0.7934, 'grad_norm': 2.9868428707122803, 'learning_rate': 8.253072754272176e-06}
+
+
+[Rank 0] Trainer log: {'loss': 0.7934, 'grad_norm': 2.9868428707122803, 'learning_rate': 8.253072754272176e-06}
+{'loss': 0.7934, 'grad_norm': 2.9868428707122803, 'learning_rate': 8.253072754272176e-06, 'epoch': 0.58}
+[Rank 0] Trainer log: {'loss': 0.8424, 'grad_norm': 2.291961431503296, 'learning_rate': 8.246358847129256e-06}[Rank 2] Trainer log: {'loss': 0.8424, 'grad_norm': 2.291961431503296, 'learning_rate': 8.246358847129256e-06}
+[Rank 3] Trainer log: {'loss': 0.8424, 'grad_norm': 2.291961431503296, 'learning_rate': 8.246358847129256e-06}
+
+[Rank 1] Trainer log: {'loss': 0.8424, 'grad_norm': 2.291961431503296, 'learning_rate': 8.246358847129256e-06}
+{'loss': 0.8424, 'grad_norm': 2.291961431503296, 'learning_rate': 8.246358847129256e-06, 'epoch': 0.58}
+[Rank 2] Trainer log: {'loss': 1.0298, 'grad_norm': 7.284506797790527, 'learning_rate': 8.239645755448905e-06}
+[Rank 3] Trainer log: {'loss': 1.0298, 'grad_norm': 7.284506797790527, 'learning_rate': 8.239645755448905e-06}[Rank 0] Trainer log: {'loss': 1.0298, 'grad_norm': 7.284506797790527, 'learning_rate': 8.239645755448905e-06}
+[Rank 1] Trainer log: {'loss': 1.0298, 'grad_norm': 7.284506797790527, 'learning_rate': 8.239645755448905e-06}
+
+{'loss': 1.0298, 'grad_norm': 7.284506797790527, 'learning_rate': 8.239645755448905e-06, 'epoch': 0.58}
+[Rank 1] Trainer log: {'loss': 1.035, 'grad_norm': 2.5546376705169678, 'learning_rate': 8.232933482352792e-06}[Rank 0] Trainer log: {'loss': 1.035, 'grad_norm': 2.5546376705169678, 'learning_rate': 8.232933482352792e-06}
+[Rank 3] Trainer log: {'loss': 1.035, 'grad_norm': 2.5546376705169678, 'learning_rate': 8.232933482352792e-06}
+[Rank 2] Trainer log: {'loss': 1.035, 'grad_norm': 2.5546376705169678, 'learning_rate': 8.232933482352792e-06}
+
+{'loss': 1.035, 'grad_norm': 2.5546376705169678, 'learning_rate': 8.232933482352792e-06, 'epoch': 0.58}
+[Rank 2] Trainer log: {'loss': 0.8326, 'grad_norm': 2.999725818634033, 'learning_rate': 8.226222030962193e-06}[Rank 3] Trainer log: {'loss': 0.8326, 'grad_norm': 2.999725818634033, 'learning_rate': 8.226222030962193e-06}
+
+[Rank 0] Trainer log: {'loss': 0.8326, 'grad_norm': 2.999725818634033, 'learning_rate': 8.226222030962193e-06}[Rank 1] Trainer log: {'loss': 0.8326, 'grad_norm': 2.999725818634033, 'learning_rate': 8.226222030962193e-06}
+
+{'loss': 0.8326, 'grad_norm': 2.999725818634033, 'learning_rate': 8.226222030962193e-06, 'epoch': 0.58}
+[Rank 2] Trainer log: {'loss': 0.7464, 'grad_norm': 10.243378639221191, 'learning_rate': 8.219511404398008e-06}[Rank 3] Trainer log: {'loss': 0.7464, 'grad_norm': 10.243378639221191, 'learning_rate': 8.219511404398008e-06}[Rank 1] Trainer log: {'loss': 0.7464, 'grad_norm': 10.243378639221191, 'learning_rate': 8.219511404398008e-06}
+
+
+[Rank 0] Trainer log: {'loss': 0.7464, 'grad_norm': 10.243378639221191, 'learning_rate': 8.219511404398008e-06}
+{'loss': 0.7464, 'grad_norm': 10.243378639221191, 'learning_rate': 8.219511404398008e-06, 'epoch': 0.58}
+[Rank 3] Trainer log: {'loss': 0.6977, 'grad_norm': 6.754606246948242, 'learning_rate': 8.212801605780754e-06}[Rank 2] Trainer log: {'loss': 0.6977, 'grad_norm': 6.754606246948242, 'learning_rate': 8.212801605780754e-06}
+
+[Rank 0] Trainer log: {'loss': 0.6977, 'grad_norm': 6.754606246948242, 'learning_rate': 8.212801605780754e-06}[Rank 1] Trainer log: {'loss': 0.6977, 'grad_norm': 6.754606246948242, 'learning_rate': 8.212801605780754e-06}
+
+{'loss': 0.6977, 'grad_norm': 6.754606246948242, 'learning_rate': 8.212801605780754e-06, 'epoch': 0.58}
+[Rank 3] Trainer log: {'loss': 0.8488, 'grad_norm': 4.681671619415283, 'learning_rate': 8.206092638230561e-06}
+[Rank 1] Trainer log: {'loss': 0.8488, 'grad_norm': 4.681671619415283, 'learning_rate': 8.206092638230561e-06}[Rank 0] Trainer log: {'loss': 0.8488, 'grad_norm': 4.681671619415283, 'learning_rate': 8.206092638230561e-06}
+[Rank 2] Trainer log: {'loss': 0.8488, 'grad_norm': 4.681671619415283, 'learning_rate': 8.206092638230561e-06}
+
+{'loss': 0.8488, 'grad_norm': 4.681671619415283, 'learning_rate': 8.206092638230561e-06, 'epoch': 0.58}
+[Rank 2] Trainer log: {'loss': 0.7159, 'grad_norm': 3.213949203491211, 'learning_rate': 8.199384504867172e-06}[Rank 3] Trainer log: {'loss': 0.7159, 'grad_norm': 3.213949203491211, 'learning_rate': 8.199384504867172e-06}
+
+[Rank 0] Trainer log: {'loss': 0.7159, 'grad_norm': 3.213949203491211, 'learning_rate': 8.199384504867172e-06}
+[Rank 1] Trainer log: {'loss': 0.7159, 'grad_norm': 3.213949203491211, 'learning_rate': 8.199384504867172e-06}
+{'loss': 0.7159, 'grad_norm': 3.213949203491211, 'learning_rate': 8.199384504867172e-06, 'epoch': 0.58}
+[Rank 0] Trainer log: {'loss': 0.5498, 'grad_norm': 3.209015130996704, 'learning_rate': 8.192677208809945e-06}[Rank 1] Trainer log: {'loss': 0.5498, 'grad_norm': 3.209015130996704, 'learning_rate': 8.192677208809945e-06}[Rank 2] Trainer log: {'loss': 0.5498, 'grad_norm': 3.209015130996704, 'learning_rate': 8.192677208809945e-06}
+
+[Rank 3] Trainer log: {'loss': 0.5498, 'grad_norm': 3.209015130996704, 'learning_rate': 8.192677208809945e-06}
+
+{'loss': 0.5498, 'grad_norm': 3.209015130996704, 'learning_rate': 8.192677208809945e-06, 'epoch': 0.58}
+[Rank 1] Trainer log: {'loss': 0.8484, 'grad_norm': 6.233098983764648, 'learning_rate': 8.18597075317785e-06}[Rank 2] Trainer log: {'loss': 0.8484, 'grad_norm': 6.233098983764648, 'learning_rate': 8.18597075317785e-06}[Rank 3] Trainer log: {'loss': 0.8484, 'grad_norm': 6.233098983764648, 'learning_rate': 8.18597075317785e-06}
+
+
+[Rank 0] Trainer log: {'loss': 0.8484, 'grad_norm': 6.233098983764648, 'learning_rate': 8.18597075317785e-06}
+{'loss': 0.8484, 'grad_norm': 6.233098983764648, 'learning_rate': 8.18597075317785e-06, 'epoch': 0.58}
+[Rank 1] Trainer log: {'loss': 0.8498, 'grad_norm': 3.9201512336730957, 'learning_rate': 8.179265141089455e-06}[Rank 2] Trainer log: {'loss': 0.8498, 'grad_norm': 3.9201512336730957, 'learning_rate': 8.179265141089455e-06}[Rank 0] Trainer log: {'loss': 0.8498, 'grad_norm': 3.9201512336730957, 'learning_rate': 8.179265141089455e-06}
+
+[Rank 3] Trainer log: {'loss': 0.8498, 'grad_norm': 3.9201512336730957, 'learning_rate': 8.179265141089455e-06}
+
+{'loss': 0.8498, 'grad_norm': 3.9201512336730957, 'learning_rate': 8.179265141089455e-06, 'epoch': 0.58}
+[Rank 2] Trainer log: {'loss': 0.7884, 'grad_norm': 4.692569255828857, 'learning_rate': 8.172560375662953e-06}
+[Rank 3] Trainer log: {'loss': 0.7884, 'grad_norm': 4.692569255828857, 'learning_rate': 8.172560375662953e-06}
+[Rank 1] Trainer log: {'loss': 0.7884, 'grad_norm': 4.692569255828857, 'learning_rate': 8.172560375662953e-06}
+[Rank 0] Trainer log: {'loss': 0.7884, 'grad_norm': 4.692569255828857, 'learning_rate': 8.172560375662953e-06}
+{'loss': 0.7884, 'grad_norm': 4.692569255828857, 'learning_rate': 8.172560375662953e-06, 'epoch': 0.58}
+[Rank 1] Trainer log: {'loss': 0.9693, 'grad_norm': 2.1812329292297363, 'learning_rate': 8.165856460016128e-06}[Rank 3] Trainer log: {'loss': 0.9693, 'grad_norm': 2.1812329292297363, 'learning_rate': 8.165856460016128e-06}[Rank 2] Trainer log: {'loss': 0.9693, 'grad_norm': 2.1812329292297363, 'learning_rate': 8.165856460016128e-06}
+
+
+[Rank 0] Trainer log: {'loss': 0.9693, 'grad_norm': 2.1812329292297363, 'learning_rate': 8.165856460016128e-06}
+{'loss': 0.9693, 'grad_norm': 2.1812329292297363, 'learning_rate': 8.165856460016128e-06, 'epoch': 0.58}
+[Rank 3] Trainer log: {'loss': 0.8143, 'grad_norm': 9.283940315246582, 'learning_rate': 8.159153397266377e-06}[Rank 1] Trainer log: {'loss': 0.8143, 'grad_norm': 9.283940315246582, 'learning_rate': 8.159153397266377e-06}[Rank 2] Trainer log: {'loss': 0.8143, 'grad_norm': 9.283940315246582, 'learning_rate': 8.159153397266377e-06}
+
+
+[Rank 0] Trainer log: {'loss': 0.8143, 'grad_norm': 9.283940315246582, 'learning_rate': 8.159153397266377e-06}
+{'loss': 0.8143, 'grad_norm': 9.283940315246582, 'learning_rate': 8.159153397266377e-06, 'epoch': 0.58}
+[Rank 2] Trainer log: {'loss': 0.8269, 'grad_norm': 5.904989719390869, 'learning_rate': 8.1524511905307e-06}[Rank 3] Trainer log: {'loss': 0.8269, 'grad_norm': 5.904989719390869, 'learning_rate': 8.1524511905307e-06}[Rank 1] Trainer log: {'loss': 0.8269, 'grad_norm': 5.904989719390869, 'learning_rate': 8.1524511905307e-06}
+
+
+[Rank 0] Trainer log: {'loss': 0.8269, 'grad_norm': 5.904989719390869, 'learning_rate': 8.1524511905307e-06}
+{'loss': 0.8269, 'grad_norm': 5.904989719390869, 'learning_rate': 8.1524511905307e-06, 'epoch': 0.58}
+[Rank 1] Trainer log: {'loss': 0.6376, 'grad_norm': 13.88896656036377, 'learning_rate': 8.145749842925698e-06}[Rank 3] Trainer log: {'loss': 0.6376, 'grad_norm': 13.88896656036377, 'learning_rate': 8.145749842925698e-06}[Rank 2] Trainer log: {'loss': 0.6376, 'grad_norm': 13.88896656036377, 'learning_rate': 8.145749842925698e-06}
+
+
+[Rank 0] Trainer log: {'loss': 0.6376, 'grad_norm': 13.88896656036377, 'learning_rate': 8.145749842925698e-06}
+{'loss': 0.6376, 'grad_norm': 13.88896656036377, 'learning_rate': 8.145749842925698e-06, 'epoch': 0.58}
+[Rank 0] Trainer log: {'loss': 1.0185, 'grad_norm': 2.079887628555298, 'learning_rate': 8.139049357567564e-06}[Rank 1] Trainer log: {'loss': 1.0185, 'grad_norm': 2.079887628555298, 'learning_rate': 8.139049357567564e-06}[Rank 3] Trainer log: {'loss': 1.0185, 'grad_norm': 2.079887628555298, 'learning_rate': 8.139049357567564e-06}
+
+[Rank 2] Trainer log: {'loss': 1.0185, 'grad_norm': 2.079887628555298, 'learning_rate': 8.139049357567564e-06}
+
+{'loss': 1.0185, 'grad_norm': 2.079887628555298, 'learning_rate': 8.139049357567564e-06, 'epoch': 0.58}
+[Rank 2] Trainer log: {'loss': 0.9039, 'grad_norm': 6.526498794555664, 'learning_rate': 8.132349737572107e-06}[Rank 1] Trainer log: {'loss': 0.9039, 'grad_norm': 6.526498794555664, 'learning_rate': 8.132349737572107e-06}[Rank 3] Trainer log: {'loss': 0.9039, 'grad_norm': 6.526498794555664, 'learning_rate': 8.132349737572107e-06}
+
+
+[Rank 0] Trainer log: {'loss': 0.9039, 'grad_norm': 6.526498794555664, 'learning_rate': 8.132349737572107e-06}
+{'loss': 0.9039, 'grad_norm': 6.526498794555664, 'learning_rate': 8.132349737572107e-06, 'epoch': 0.58}
+[Rank 2] Trainer log: {'loss': 0.9238, 'grad_norm': 6.45925235748291, 'learning_rate': 8.125650986054726e-06}[Rank 3] Trainer log: {'loss': 0.9238, 'grad_norm': 6.45925235748291, 'learning_rate': 8.125650986054726e-06}
+[Rank 1] Trainer log: {'loss': 0.9238, 'grad_norm': 6.45925235748291, 'learning_rate': 8.125650986054726e-06}
+
+[Rank 0] Trainer log: {'loss': 0.9238, 'grad_norm': 6.45925235748291, 'learning_rate': 8.125650986054726e-06}
+{'loss': 0.9238, 'grad_norm': 6.45925235748291, 'learning_rate': 8.125650986054726e-06, 'epoch': 0.58}
+[Rank 1] Trainer log: {'loss': 0.9597, 'grad_norm': 2.490858316421509, 'learning_rate': 8.118953106130405e-06}[Rank 3] Trainer log: {'loss': 0.9597, 'grad_norm': 2.490858316421509, 'learning_rate': 8.118953106130405e-06}[Rank 0] Trainer log: {'loss': 0.9597, 'grad_norm': 2.490858316421509, 'learning_rate': 8.118953106130405e-06}
+
+[Rank 2] Trainer log: {'loss': 0.9597, 'grad_norm': 2.490858316421509, 'learning_rate': 8.118953106130405e-06}
+
+{'loss': 0.9597, 'grad_norm': 2.490858316421509, 'learning_rate': 8.118953106130405e-06, 'epoch': 0.58}
+[Rank 0] Trainer log: {'loss': 0.9447, 'grad_norm': 5.7371721267700195, 'learning_rate': 8.112256100913738e-06}[Rank 1] Trainer log: {'loss': 0.9447, 'grad_norm': 5.7371721267700195, 'learning_rate': 8.112256100913738e-06}
+[Rank 2] Trainer log: {'loss': 0.9447, 'grad_norm': 5.7371721267700195, 'learning_rate': 8.112256100913738e-06}
+[Rank 3] Trainer log: {'loss': 0.9447, 'grad_norm': 5.7371721267700195, 'learning_rate': 8.112256100913738e-06}
+
+{'loss': 0.9447, 'grad_norm': 5.7371721267700195, 'learning_rate': 8.112256100913738e-06, 'epoch': 0.58}
+[Rank 0] Trainer log: {'loss': 0.8225, 'grad_norm': 4.1787800788879395, 'learning_rate': 8.105559973518905e-06}[Rank 2] Trainer log: {'loss': 0.8225, 'grad_norm': 4.1787800788879395, 'learning_rate': 8.105559973518905e-06}[Rank 1] Trainer log: {'loss': 0.8225, 'grad_norm': 4.1787800788879395, 'learning_rate': 8.105559973518905e-06}
+
+[Rank 3] Trainer log: {'loss': 0.8225, 'grad_norm': 4.1787800788879395, 'learning_rate': 8.105559973518905e-06}
+
+{'loss': 0.8225, 'grad_norm': 4.1787800788879395, 'learning_rate': 8.105559973518905e-06, 'epoch': 0.58}