aiden200 committed
Commit a978430 · verified · Parent: 61a0272

Training in progress, step 2825

Files changed (2)
  1. adapter_model.safetensors +1 -1
  2. train.log +120 -0
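Commits titled "Training in progress, step N" are the automatic checkpoint pushes that transformers' Trainer makes when push_to_hub is enabled, which matches the per-step metric dicts appended to train.log below. A minimal sketch of a configuration that would produce such commits; every value here is hypothetical, not the repository's actual setup:

    # Sketch only: hypothetical arguments reconstructing the push-during-training
    # setup implied by this commit message; not the repo's actual configuration.
    from transformers import TrainingArguments

    args = TrainingArguments(
        output_dir="out",                         # hypothetical local checkpoint dir
        push_to_hub=True,                         # upload each saved checkpoint
        hub_model_id="aiden200/example-adapter",  # hypothetical Hub repo name
        save_strategy="steps",
        save_steps=25,                            # a Hub commit lands at every save
        logging_steps=1,                          # emits the per-step loss/lr dicts
    )

With arguments like these, Trainer.train() pushes each saved checkpoint with a commit message of the form "Training in progress, step N", as seen above.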
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:4da18afe65af7dcd1725167031fac839af3bb5de549fe5b103074d375135fff9
+ oid sha256:0a54bf4b2daf665f7187e1e4a2e0253f97194901c1f5c3704ab843df6d5fb57f
  size 1140991056
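A Git LFS pointer file (spec v1, as above) stores the SHA-256 of the tracked blob as its oid, so this commit simply swaps that digest while the adapter's size stays 1140991056 bytes. A small sketch of checking a downloaded file against the new pointer; the local path is an assumption:

    import hashlib

    def lfs_oid(path: str, chunk_size: int = 1 << 20) -> str:
        """Compute the sha256 digest that Git LFS records as the pointer oid."""
        h = hashlib.sha256()
        with open(path, "rb") as f:
            # Stream in chunks so large checkpoint files don't load into memory.
            for chunk in iter(lambda: f.read(chunk_size), b""):
                h.update(chunk)
        return h.hexdigest()

    # Should print the new oid from the diff above for the freshly pushed file:
    # print(lfs_oid("adapter_model.safetensors"))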
train.log CHANGED
@@ -1845,3 +1845,123 @@ Time to load cpu_adam op: 2.392569065093994 seconds
  
  [Rank 0] Trainer log: {'loss': 0.8147, 'grad_norm': 3.9575250148773193, 'learning_rate': 8.266503002457191e-06}
  {'loss': 0.8147, 'grad_norm': 3.9575250148773193, 'learning_rate': 8.266503002457191e-06, 'epoch': 0.58}
+ [Rank 2] Trainer log: {'loss': 0.754, 'grad_norm': 5.0994720458984375, 'learning_rate': 8.259787473755625e-06}[Rank 1] Trainer log: {'loss': 0.754, 'grad_norm': 5.0994720458984375, 'learning_rate': 8.259787473755625e-06}[Rank 3] Trainer log: {'loss': 0.754, 'grad_norm': 5.0994720458984375, 'learning_rate': 8.259787473755625e-06}
+
+
+ [Rank 0] Trainer log: {'loss': 0.754, 'grad_norm': 5.0994720458984375, 'learning_rate': 8.259787473755625e-06}
+ {'loss': 0.754, 'grad_norm': 5.0994720458984375, 'learning_rate': 8.259787473755625e-06, 'epoch': 0.58}
+ [Rank 2] Trainer log: {'loss': 0.7934, 'grad_norm': 2.9868428707122803, 'learning_rate': 8.253072754272176e-06}[Rank 1] Trainer log: {'loss': 0.7934, 'grad_norm': 2.9868428707122803, 'learning_rate': 8.253072754272176e-06}[Rank 3] Trainer log: {'loss': 0.7934, 'grad_norm': 2.9868428707122803, 'learning_rate': 8.253072754272176e-06}
+
+
+ [Rank 0] Trainer log: {'loss': 0.7934, 'grad_norm': 2.9868428707122803, 'learning_rate': 8.253072754272176e-06}
+ {'loss': 0.7934, 'grad_norm': 2.9868428707122803, 'learning_rate': 8.253072754272176e-06, 'epoch': 0.58}
+ [Rank 0] Trainer log: {'loss': 0.8424, 'grad_norm': 2.291961431503296, 'learning_rate': 8.246358847129256e-06}[Rank 2] Trainer log: {'loss': 0.8424, 'grad_norm': 2.291961431503296, 'learning_rate': 8.246358847129256e-06}
+ [Rank 3] Trainer log: {'loss': 0.8424, 'grad_norm': 2.291961431503296, 'learning_rate': 8.246358847129256e-06}
+
+ [Rank 1] Trainer log: {'loss': 0.8424, 'grad_norm': 2.291961431503296, 'learning_rate': 8.246358847129256e-06}
+ {'loss': 0.8424, 'grad_norm': 2.291961431503296, 'learning_rate': 8.246358847129256e-06, 'epoch': 0.58}
+ [Rank 2] Trainer log: {'loss': 1.0298, 'grad_norm': 7.284506797790527, 'learning_rate': 8.239645755448905e-06}
+ [Rank 3] Trainer log: {'loss': 1.0298, 'grad_norm': 7.284506797790527, 'learning_rate': 8.239645755448905e-06}[Rank 0] Trainer log: {'loss': 1.0298, 'grad_norm': 7.284506797790527, 'learning_rate': 8.239645755448905e-06}
+ [Rank 1] Trainer log: {'loss': 1.0298, 'grad_norm': 7.284506797790527, 'learning_rate': 8.239645755448905e-06}
+
+ {'loss': 1.0298, 'grad_norm': 7.284506797790527, 'learning_rate': 8.239645755448905e-06, 'epoch': 0.58}
+ [Rank 1] Trainer log: {'loss': 1.035, 'grad_norm': 2.5546376705169678, 'learning_rate': 8.232933482352792e-06}[Rank 0] Trainer log: {'loss': 1.035, 'grad_norm': 2.5546376705169678, 'learning_rate': 8.232933482352792e-06}
+ [Rank 3] Trainer log: {'loss': 1.035, 'grad_norm': 2.5546376705169678, 'learning_rate': 8.232933482352792e-06}
+ [Rank 2] Trainer log: {'loss': 1.035, 'grad_norm': 2.5546376705169678, 'learning_rate': 8.232933482352792e-06}
+
+ {'loss': 1.035, 'grad_norm': 2.5546376705169678, 'learning_rate': 8.232933482352792e-06, 'epoch': 0.58}
+ [Rank 2] Trainer log: {'loss': 0.8326, 'grad_norm': 2.999725818634033, 'learning_rate': 8.226222030962193e-06}[Rank 3] Trainer log: {'loss': 0.8326, 'grad_norm': 2.999725818634033, 'learning_rate': 8.226222030962193e-06}
+
+ [Rank 0] Trainer log: {'loss': 0.8326, 'grad_norm': 2.999725818634033, 'learning_rate': 8.226222030962193e-06}[Rank 1] Trainer log: {'loss': 0.8326, 'grad_norm': 2.999725818634033, 'learning_rate': 8.226222030962193e-06}
+
+ {'loss': 0.8326, 'grad_norm': 2.999725818634033, 'learning_rate': 8.226222030962193e-06, 'epoch': 0.58}
+ [Rank 2] Trainer log: {'loss': 0.7464, 'grad_norm': 10.243378639221191, 'learning_rate': 8.219511404398008e-06}[Rank 3] Trainer log: {'loss': 0.7464, 'grad_norm': 10.243378639221191, 'learning_rate': 8.219511404398008e-06}[Rank 1] Trainer log: {'loss': 0.7464, 'grad_norm': 10.243378639221191, 'learning_rate': 8.219511404398008e-06}
+
+
+ [Rank 0] Trainer log: {'loss': 0.7464, 'grad_norm': 10.243378639221191, 'learning_rate': 8.219511404398008e-06}
+ {'loss': 0.7464, 'grad_norm': 10.243378639221191, 'learning_rate': 8.219511404398008e-06, 'epoch': 0.58}
+ [Rank 3] Trainer log: {'loss': 0.6977, 'grad_norm': 6.754606246948242, 'learning_rate': 8.212801605780754e-06}[Rank 2] Trainer log: {'loss': 0.6977, 'grad_norm': 6.754606246948242, 'learning_rate': 8.212801605780754e-06}
+
+ [Rank 0] Trainer log: {'loss': 0.6977, 'grad_norm': 6.754606246948242, 'learning_rate': 8.212801605780754e-06}[Rank 1] Trainer log: {'loss': 0.6977, 'grad_norm': 6.754606246948242, 'learning_rate': 8.212801605780754e-06}
+
+ {'loss': 0.6977, 'grad_norm': 6.754606246948242, 'learning_rate': 8.212801605780754e-06, 'epoch': 0.58}
+ [Rank 3] Trainer log: {'loss': 0.8488, 'grad_norm': 4.681671619415283, 'learning_rate': 8.206092638230561e-06}
+ [Rank 1] Trainer log: {'loss': 0.8488, 'grad_norm': 4.681671619415283, 'learning_rate': 8.206092638230561e-06}[Rank 0] Trainer log: {'loss': 0.8488, 'grad_norm': 4.681671619415283, 'learning_rate': 8.206092638230561e-06}
+ [Rank 2] Trainer log: {'loss': 0.8488, 'grad_norm': 4.681671619415283, 'learning_rate': 8.206092638230561e-06}
+
+ {'loss': 0.8488, 'grad_norm': 4.681671619415283, 'learning_rate': 8.206092638230561e-06, 'epoch': 0.58}
+ [Rank 2] Trainer log: {'loss': 0.7159, 'grad_norm': 3.213949203491211, 'learning_rate': 8.199384504867172e-06}[Rank 3] Trainer log: {'loss': 0.7159, 'grad_norm': 3.213949203491211, 'learning_rate': 8.199384504867172e-06}
+
+ [Rank 0] Trainer log: {'loss': 0.7159, 'grad_norm': 3.213949203491211, 'learning_rate': 8.199384504867172e-06}
+ [Rank 1] Trainer log: {'loss': 0.7159, 'grad_norm': 3.213949203491211, 'learning_rate': 8.199384504867172e-06}
+ {'loss': 0.7159, 'grad_norm': 3.213949203491211, 'learning_rate': 8.199384504867172e-06, 'epoch': 0.58}
+ [Rank 0] Trainer log: {'loss': 0.5498, 'grad_norm': 3.209015130996704, 'learning_rate': 8.192677208809945e-06}[Rank 1] Trainer log: {'loss': 0.5498, 'grad_norm': 3.209015130996704, 'learning_rate': 8.192677208809945e-06}[Rank 2] Trainer log: {'loss': 0.5498, 'grad_norm': 3.209015130996704, 'learning_rate': 8.192677208809945e-06}
+
+ [Rank 3] Trainer log: {'loss': 0.5498, 'grad_norm': 3.209015130996704, 'learning_rate': 8.192677208809945e-06}
+
+ {'loss': 0.5498, 'grad_norm': 3.209015130996704, 'learning_rate': 8.192677208809945e-06, 'epoch': 0.58}
+ [Rank 1] Trainer log: {'loss': 0.8484, 'grad_norm': 6.233098983764648, 'learning_rate': 8.18597075317785e-06}[Rank 2] Trainer log: {'loss': 0.8484, 'grad_norm': 6.233098983764648, 'learning_rate': 8.18597075317785e-06}[Rank 3] Trainer log: {'loss': 0.8484, 'grad_norm': 6.233098983764648, 'learning_rate': 8.18597075317785e-06}
+
+
+ [Rank 0] Trainer log: {'loss': 0.8484, 'grad_norm': 6.233098983764648, 'learning_rate': 8.18597075317785e-06}
+ {'loss': 0.8484, 'grad_norm': 6.233098983764648, 'learning_rate': 8.18597075317785e-06, 'epoch': 0.58}
+ [Rank 1] Trainer log: {'loss': 0.8498, 'grad_norm': 3.9201512336730957, 'learning_rate': 8.179265141089455e-06}[Rank 2] Trainer log: {'loss': 0.8498, 'grad_norm': 3.9201512336730957, 'learning_rate': 8.179265141089455e-06}[Rank 0] Trainer log: {'loss': 0.8498, 'grad_norm': 3.9201512336730957, 'learning_rate': 8.179265141089455e-06}
+
+ [Rank 3] Trainer log: {'loss': 0.8498, 'grad_norm': 3.9201512336730957, 'learning_rate': 8.179265141089455e-06}
+
+ {'loss': 0.8498, 'grad_norm': 3.9201512336730957, 'learning_rate': 8.179265141089455e-06, 'epoch': 0.58}
+ [Rank 2] Trainer log: {'loss': 0.7884, 'grad_norm': 4.692569255828857, 'learning_rate': 8.172560375662953e-06}
+ [Rank 3] Trainer log: {'loss': 0.7884, 'grad_norm': 4.692569255828857, 'learning_rate': 8.172560375662953e-06}
+ [Rank 1] Trainer log: {'loss': 0.7884, 'grad_norm': 4.692569255828857, 'learning_rate': 8.172560375662953e-06}
+ [Rank 0] Trainer log: {'loss': 0.7884, 'grad_norm': 4.692569255828857, 'learning_rate': 8.172560375662953e-06}
+ {'loss': 0.7884, 'grad_norm': 4.692569255828857, 'learning_rate': 8.172560375662953e-06, 'epoch': 0.58}
+ [Rank 1] Trainer log: {'loss': 0.9693, 'grad_norm': 2.1812329292297363, 'learning_rate': 8.165856460016128e-06}[Rank 3] Trainer log: {'loss': 0.9693, 'grad_norm': 2.1812329292297363, 'learning_rate': 8.165856460016128e-06}[Rank 2] Trainer log: {'loss': 0.9693, 'grad_norm': 2.1812329292297363, 'learning_rate': 8.165856460016128e-06}
+
+
+ [Rank 0] Trainer log: {'loss': 0.9693, 'grad_norm': 2.1812329292297363, 'learning_rate': 8.165856460016128e-06}
+ {'loss': 0.9693, 'grad_norm': 2.1812329292297363, 'learning_rate': 8.165856460016128e-06, 'epoch': 0.58}
+ [Rank 3] Trainer log: {'loss': 0.8143, 'grad_norm': 9.283940315246582, 'learning_rate': 8.159153397266377e-06}[Rank 1] Trainer log: {'loss': 0.8143, 'grad_norm': 9.283940315246582, 'learning_rate': 8.159153397266377e-06}[Rank 2] Trainer log: {'loss': 0.8143, 'grad_norm': 9.283940315246582, 'learning_rate': 8.159153397266377e-06}
+
+
+ [Rank 0] Trainer log: {'loss': 0.8143, 'grad_norm': 9.283940315246582, 'learning_rate': 8.159153397266377e-06}
+ {'loss': 0.8143, 'grad_norm': 9.283940315246582, 'learning_rate': 8.159153397266377e-06, 'epoch': 0.58}
+ [Rank 2] Trainer log: {'loss': 0.8269, 'grad_norm': 5.904989719390869, 'learning_rate': 8.1524511905307e-06}[Rank 3] Trainer log: {'loss': 0.8269, 'grad_norm': 5.904989719390869, 'learning_rate': 8.1524511905307e-06}[Rank 1] Trainer log: {'loss': 0.8269, 'grad_norm': 5.904989719390869, 'learning_rate': 8.1524511905307e-06}
+
+
+ [Rank 0] Trainer log: {'loss': 0.8269, 'grad_norm': 5.904989719390869, 'learning_rate': 8.1524511905307e-06}
+ {'loss': 0.8269, 'grad_norm': 5.904989719390869, 'learning_rate': 8.1524511905307e-06, 'epoch': 0.58}
+ [Rank 1] Trainer log: {'loss': 0.6376, 'grad_norm': 13.88896656036377, 'learning_rate': 8.145749842925698e-06}[Rank 3] Trainer log: {'loss': 0.6376, 'grad_norm': 13.88896656036377, 'learning_rate': 8.145749842925698e-06}[Rank 2] Trainer log: {'loss': 0.6376, 'grad_norm': 13.88896656036377, 'learning_rate': 8.145749842925698e-06}
+
+
+ [Rank 0] Trainer log: {'loss': 0.6376, 'grad_norm': 13.88896656036377, 'learning_rate': 8.145749842925698e-06}
+ {'loss': 0.6376, 'grad_norm': 13.88896656036377, 'learning_rate': 8.145749842925698e-06, 'epoch': 0.58}
+ [Rank 0] Trainer log: {'loss': 1.0185, 'grad_norm': 2.079887628555298, 'learning_rate': 8.139049357567564e-06}[Rank 1] Trainer log: {'loss': 1.0185, 'grad_norm': 2.079887628555298, 'learning_rate': 8.139049357567564e-06}[Rank 3] Trainer log: {'loss': 1.0185, 'grad_norm': 2.079887628555298, 'learning_rate': 8.139049357567564e-06}
+
+ [Rank 2] Trainer log: {'loss': 1.0185, 'grad_norm': 2.079887628555298, 'learning_rate': 8.139049357567564e-06}
+
+ {'loss': 1.0185, 'grad_norm': 2.079887628555298, 'learning_rate': 8.139049357567564e-06, 'epoch': 0.58}
+ [Rank 2] Trainer log: {'loss': 0.9039, 'grad_norm': 6.526498794555664, 'learning_rate': 8.132349737572107e-06}[Rank 1] Trainer log: {'loss': 0.9039, 'grad_norm': 6.526498794555664, 'learning_rate': 8.132349737572107e-06}[Rank 3] Trainer log: {'loss': 0.9039, 'grad_norm': 6.526498794555664, 'learning_rate': 8.132349737572107e-06}
+
+
+ [Rank 0] Trainer log: {'loss': 0.9039, 'grad_norm': 6.526498794555664, 'learning_rate': 8.132349737572107e-06}
+ {'loss': 0.9039, 'grad_norm': 6.526498794555664, 'learning_rate': 8.132349737572107e-06, 'epoch': 0.58}
+ [Rank 2] Trainer log: {'loss': 0.9238, 'grad_norm': 6.45925235748291, 'learning_rate': 8.125650986054726e-06}[Rank 3] Trainer log: {'loss': 0.9238, 'grad_norm': 6.45925235748291, 'learning_rate': 8.125650986054726e-06}
+ [Rank 1] Trainer log: {'loss': 0.9238, 'grad_norm': 6.45925235748291, 'learning_rate': 8.125650986054726e-06}
+
+ [Rank 0] Trainer log: {'loss': 0.9238, 'grad_norm': 6.45925235748291, 'learning_rate': 8.125650986054726e-06}
+ {'loss': 0.9238, 'grad_norm': 6.45925235748291, 'learning_rate': 8.125650986054726e-06, 'epoch': 0.58}
+ [Rank 1] Trainer log: {'loss': 0.9597, 'grad_norm': 2.490858316421509, 'learning_rate': 8.118953106130405e-06}[Rank 3] Trainer log: {'loss': 0.9597, 'grad_norm': 2.490858316421509, 'learning_rate': 8.118953106130405e-06}[Rank 0] Trainer log: {'loss': 0.9597, 'grad_norm': 2.490858316421509, 'learning_rate': 8.118953106130405e-06}
+
+ [Rank 2] Trainer log: {'loss': 0.9597, 'grad_norm': 2.490858316421509, 'learning_rate': 8.118953106130405e-06}
+
+ {'loss': 0.9597, 'grad_norm': 2.490858316421509, 'learning_rate': 8.118953106130405e-06, 'epoch': 0.58}
+ [Rank 0] Trainer log: {'loss': 0.9447, 'grad_norm': 5.7371721267700195, 'learning_rate': 8.112256100913738e-06}[Rank 1] Trainer log: {'loss': 0.9447, 'grad_norm': 5.7371721267700195, 'learning_rate': 8.112256100913738e-06}
+ [Rank 2] Trainer log: {'loss': 0.9447, 'grad_norm': 5.7371721267700195, 'learning_rate': 8.112256100913738e-06}
+ [Rank 3] Trainer log: {'loss': 0.9447, 'grad_norm': 5.7371721267700195, 'learning_rate': 8.112256100913738e-06}
+
+ {'loss': 0.9447, 'grad_norm': 5.7371721267700195, 'learning_rate': 8.112256100913738e-06, 'epoch': 0.58}
+ [Rank 0] Trainer log: {'loss': 0.8225, 'grad_norm': 4.1787800788879395, 'learning_rate': 8.105559973518905e-06}[Rank 2] Trainer log: {'loss': 0.8225, 'grad_norm': 4.1787800788879395, 'learning_rate': 8.105559973518905e-06}[Rank 1] Trainer log: {'loss': 0.8225, 'grad_norm': 4.1787800788879395, 'learning_rate': 8.105559973518905e-06}
+
+ [Rank 3] Trainer log: {'loss': 0.8225, 'grad_norm': 4.1787800788879395, 'learning_rate': 8.105559973518905e-06}
+
+ {'loss': 0.8225, 'grad_norm': 4.1787800788879395, 'learning_rate': 8.105559973518905e-06, 'epoch': 0.58}
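
Each appended record is a flat Python dict literal, and because the four ranks write to the same file concurrently, several records sometimes share one physical line or leave stray blank lines, as visible above. A minimal sketch of recovering the metrics despite the interleaving, assuming a local copy of train.log:

    import ast
    import re

    RECORD = re.compile(r"\{[^{}]*\}")  # records are flat dicts, never nested

    def parse_trainer_log(path: str):
        """Yield metric dicts from train.log, tolerating interleaved rank writes."""
        with open(path) as f:
            for line in f:
                # A line may hold zero, one, or several concatenated records.
                for literal in RECORD.findall(line):
                    try:
                        yield ast.literal_eval(literal)
                    except (ValueError, SyntaxError):
                        continue  # skip anything that isn't a clean dict

    # Example: the 'epoch' key marks the deduplicated per-step summary records
    # losses = [r["loss"] for r in parse_trainer_log("train.log") if "epoch" in r]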