faridkarimli commited on
Commit
49307cd
·
verified ·
1 Parent(s): e417da6

Training in progress, epoch 65

Browse files
all_results.json CHANGED
@@ -1,12 +1,7 @@
1
  {
2
- "epoch": 70.0,
3
- "eval_accuracy": 0.7132267571670984,
4
- "eval_loss": 2.1484131813049316,
5
- "eval_runtime": 2588.9498,
6
- "eval_samples_per_second": 64.874,
7
- "eval_steps_per_second": 4.055,
8
- "train_loss": 0.08673005339085323,
9
- "train_runtime": 125919.3774,
10
- "train_samples_per_second": 373.471,
11
- "train_steps_per_second": 0.73
12
  }
 
1
  {
2
+ "epoch": 64.0,
3
+ "train_loss": 0.0,
4
+ "train_runtime": 758.3182,
5
+ "train_samples_per_second": 53155.811,
6
+ "train_steps_per_second": 207.696
 
 
 
 
 
7
  }
config.json CHANGED
@@ -31044,6 +31044,7 @@
31044
  0,
31045
  0
31046
  ],
 
31047
  "qkv_bias": true,
31048
  "torch_dtype": "float32",
31049
  "transformers_version": "4.33.3",
 
31044
  0,
31045
  0
31046
  ],
31047
+ "problem_type": "single_label_classification",
31048
  "qkv_bias": true,
31049
  "torch_dtype": "float32",
31050
  "transformers_version": "4.33.3",
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e03758fda2ac49108064401c9e84fd23ee0ab573a1764d33b3fcd3f3c9fa4663
3
  size 411298414
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b38174b9aa57c0f58305fc7409d9a19a0faf0f9b07e0c2e3366e75424596f69c
3
  size 411298414
train_results.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
- "epoch": 70.0,
3
- "train_loss": 0.08673005339085323,
4
- "train_runtime": 125919.3774,
5
- "train_samples_per_second": 373.471,
6
- "train_steps_per_second": 0.73
7
  }
 
1
  {
2
+ "epoch": 64.0,
3
+ "train_loss": 0.0,
4
+ "train_runtime": 758.3182,
5
+ "train_samples_per_second": 53155.811,
6
+ "train_steps_per_second": 207.696
7
  }
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 70.0,
5
  "eval_steps": 500,
6
- "global_step": 91910,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -1737,20 +1737,1199 @@
1737
  "step": 91910
1738
  },
1739
  {
1740
- "epoch": 70.0,
1741
- "step": 91910,
1742
- "total_flos": 3.2073215097814647e+21,
1743
- "train_loss": 0.08673005339085323,
1744
- "train_runtime": 125919.3774,
1745
- "train_samples_per_second": 373.471,
1746
- "train_steps_per_second": 0.73
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1747
  }
1748
  ],
1749
  "logging_steps": 500,
1750
- "max_steps": 91910,
1751
- "num_train_epochs": 70,
1752
  "save_steps": 500,
1753
- "total_flos": 3.2073215097814647e+21,
1754
  "trial_name": null,
1755
  "trial_params": null
1756
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 64.0,
5
  "eval_steps": 500,
6
+ "global_step": 168000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
1737
  "step": 91910
1738
  },
1739
  {
1740
+ "epoch": 35.05,
1741
+ "learning_rate": 0.003247619047619048,
1742
+ "loss": 0.5379,
1743
+ "step": 92000
1744
+ },
1745
+ {
1746
+ "epoch": 35.24,
1747
+ "learning_rate": 0.0032380952380952383,
1748
+ "loss": 0.6972,
1749
+ "step": 92500
1750
+ },
1751
+ {
1752
+ "epoch": 35.43,
1753
+ "learning_rate": 0.0032285714285714283,
1754
+ "loss": 0.7947,
1755
+ "step": 93000
1756
+ },
1757
+ {
1758
+ "epoch": 35.62,
1759
+ "learning_rate": 0.003219047619047619,
1760
+ "loss": 0.8301,
1761
+ "step": 93500
1762
+ },
1763
+ {
1764
+ "epoch": 35.81,
1765
+ "learning_rate": 0.0032095238095238092,
1766
+ "loss": 0.8519,
1767
+ "step": 94000
1768
+ },
1769
+ {
1770
+ "epoch": 36.0,
1771
+ "learning_rate": 0.0032,
1772
+ "loss": 0.8802,
1773
+ "step": 94500
1774
+ },
1775
+ {
1776
+ "epoch": 36.0,
1777
+ "eval_accuracy": 0.6566997112321753,
1778
+ "eval_loss": 1.7587852478027344,
1779
+ "eval_runtime": 4381.2628,
1780
+ "eval_samples_per_second": 38.335,
1781
+ "eval_steps_per_second": 4.792,
1782
+ "step": 94500
1783
+ },
1784
+ {
1785
+ "epoch": 36.19,
1786
+ "learning_rate": 0.00319047619047619,
1787
+ "loss": 0.7776,
1788
+ "step": 95000
1789
+ },
1790
+ {
1791
+ "epoch": 36.38,
1792
+ "learning_rate": 0.003180952380952381,
1793
+ "loss": 0.8104,
1794
+ "step": 95500
1795
+ },
1796
+ {
1797
+ "epoch": 36.57,
1798
+ "learning_rate": 0.003171428571428571,
1799
+ "loss": 0.8249,
1800
+ "step": 96000
1801
+ },
1802
+ {
1803
+ "epoch": 36.76,
1804
+ "learning_rate": 0.003161904761904762,
1805
+ "loss": 0.8548,
1806
+ "step": 96500
1807
+ },
1808
+ {
1809
+ "epoch": 36.95,
1810
+ "learning_rate": 0.0031523809523809525,
1811
+ "loss": 0.8772,
1812
+ "step": 97000
1813
+ },
1814
+ {
1815
+ "epoch": 37.0,
1816
+ "eval_accuracy": 0.6668571938912209,
1817
+ "eval_loss": 1.6901220083236694,
1818
+ "eval_runtime": 2781.7361,
1819
+ "eval_samples_per_second": 60.378,
1820
+ "eval_steps_per_second": 7.547,
1821
+ "step": 97125
1822
+ },
1823
+ {
1824
+ "epoch": 37.14,
1825
+ "learning_rate": 0.003142857142857143,
1826
+ "loss": 0.793,
1827
+ "step": 97500
1828
+ },
1829
+ {
1830
+ "epoch": 37.33,
1831
+ "learning_rate": 0.0031333333333333335,
1832
+ "loss": 0.7985,
1833
+ "step": 98000
1834
+ },
1835
+ {
1836
+ "epoch": 37.52,
1837
+ "learning_rate": 0.003123809523809524,
1838
+ "loss": 0.8144,
1839
+ "step": 98500
1840
+ },
1841
+ {
1842
+ "epoch": 37.71,
1843
+ "learning_rate": 0.0031142857142857144,
1844
+ "loss": 0.8338,
1845
+ "step": 99000
1846
+ },
1847
+ {
1848
+ "epoch": 37.9,
1849
+ "learning_rate": 0.003104761904761905,
1850
+ "loss": 0.847,
1851
+ "step": 99500
1852
+ },
1853
+ {
1854
+ "epoch": 38.0,
1855
+ "eval_accuracy": 0.6682742401238427,
1856
+ "eval_loss": 1.7208112478256226,
1857
+ "eval_runtime": 2692.0576,
1858
+ "eval_samples_per_second": 62.389,
1859
+ "eval_steps_per_second": 7.799,
1860
+ "step": 99750
1861
+ },
1862
+ {
1863
+ "epoch": 38.1,
1864
+ "learning_rate": 0.0030952380952380953,
1865
+ "loss": 0.8028,
1866
+ "step": 100000
1867
+ },
1868
+ {
1869
+ "epoch": 38.29,
1870
+ "learning_rate": 0.0030857142857142854,
1871
+ "loss": 0.764,
1872
+ "step": 100500
1873
+ },
1874
+ {
1875
+ "epoch": 38.48,
1876
+ "learning_rate": 0.0030761904761904763,
1877
+ "loss": 0.7868,
1878
+ "step": 101000
1879
+ },
1880
+ {
1881
+ "epoch": 38.67,
1882
+ "learning_rate": 0.0030666666666666663,
1883
+ "loss": 0.8082,
1884
+ "step": 101500
1885
+ },
1886
+ {
1887
+ "epoch": 38.86,
1888
+ "learning_rate": 0.0030571428571428572,
1889
+ "loss": 0.8349,
1890
+ "step": 102000
1891
+ },
1892
+ {
1893
+ "epoch": 39.0,
1894
+ "eval_accuracy": 0.6679705873597095,
1895
+ "eval_loss": 1.7477226257324219,
1896
+ "eval_runtime": 2679.7183,
1897
+ "eval_samples_per_second": 62.676,
1898
+ "eval_steps_per_second": 7.835,
1899
+ "step": 102375
1900
+ },
1901
+ {
1902
+ "epoch": 39.05,
1903
+ "learning_rate": 0.003047619047619048,
1904
+ "loss": 0.8031,
1905
+ "step": 102500
1906
+ },
1907
+ {
1908
+ "epoch": 39.24,
1909
+ "learning_rate": 0.003038095238095238,
1910
+ "loss": 0.7409,
1911
+ "step": 103000
1912
+ },
1913
+ {
1914
+ "epoch": 39.43,
1915
+ "learning_rate": 0.003028571428571429,
1916
+ "loss": 0.7646,
1917
+ "step": 103500
1918
+ },
1919
+ {
1920
+ "epoch": 39.62,
1921
+ "learning_rate": 0.003019047619047619,
1922
+ "loss": 0.7781,
1923
+ "step": 104000
1924
+ },
1925
+ {
1926
+ "epoch": 39.81,
1927
+ "learning_rate": 0.0030095238095238096,
1928
+ "loss": 0.8094,
1929
+ "step": 104500
1930
+ },
1931
+ {
1932
+ "epoch": 40.0,
1933
+ "learning_rate": 0.003,
1934
+ "loss": 0.8159,
1935
+ "step": 105000
1936
+ },
1937
+ {
1938
+ "epoch": 40.0,
1939
+ "eval_accuracy": 0.6640588252805811,
1940
+ "eval_loss": 1.7669143676757812,
1941
+ "eval_runtime": 2681.934,
1942
+ "eval_samples_per_second": 62.625,
1943
+ "eval_steps_per_second": 7.828,
1944
+ "step": 105000
1945
+ },
1946
+ {
1947
+ "epoch": 40.19,
1948
+ "learning_rate": 0.0029904761904761905,
1949
+ "loss": 0.7129,
1950
+ "step": 105500
1951
+ },
1952
+ {
1953
+ "epoch": 40.38,
1954
+ "learning_rate": 0.002980952380952381,
1955
+ "loss": 0.7458,
1956
+ "step": 106000
1957
+ },
1958
+ {
1959
+ "epoch": 40.57,
1960
+ "learning_rate": 0.0029714285714285715,
1961
+ "loss": 0.7784,
1962
+ "step": 106500
1963
+ },
1964
+ {
1965
+ "epoch": 40.76,
1966
+ "learning_rate": 0.002961904761904762,
1967
+ "loss": 0.7782,
1968
+ "step": 107000
1969
+ },
1970
+ {
1971
+ "epoch": 40.95,
1972
+ "learning_rate": 0.0029523809523809524,
1973
+ "loss": 0.7894,
1974
+ "step": 107500
1975
+ },
1976
+ {
1977
+ "epoch": 41.0,
1978
+ "eval_accuracy": 0.669774642017207,
1979
+ "eval_loss": 1.79474675655365,
1980
+ "eval_runtime": 2697.7877,
1981
+ "eval_samples_per_second": 62.257,
1982
+ "eval_steps_per_second": 7.782,
1983
+ "step": 107625
1984
+ },
1985
+ {
1986
+ "epoch": 41.14,
1987
+ "learning_rate": 0.002942857142857143,
1988
+ "loss": 0.7193,
1989
+ "step": 108000
1990
+ },
1991
+ {
1992
+ "epoch": 41.33,
1993
+ "learning_rate": 0.0029333333333333334,
1994
+ "loss": 0.7162,
1995
+ "step": 108500
1996
+ },
1997
+ {
1998
+ "epoch": 41.52,
1999
+ "learning_rate": 0.0029238095238095243,
2000
+ "loss": 0.7387,
2001
+ "step": 109000
2002
+ },
2003
+ {
2004
+ "epoch": 41.71,
2005
+ "learning_rate": 0.0029142857142857143,
2006
+ "loss": 0.7592,
2007
+ "step": 109500
2008
+ },
2009
+ {
2010
+ "epoch": 41.9,
2011
+ "learning_rate": 0.002904761904761905,
2012
+ "loss": 0.765,
2013
+ "step": 110000
2014
+ },
2015
+ {
2016
+ "epoch": 42.0,
2017
+ "eval_accuracy": 0.6673335119526064,
2018
+ "eval_loss": 1.7770148515701294,
2019
+ "eval_runtime": 2711.4758,
2020
+ "eval_samples_per_second": 61.942,
2021
+ "eval_steps_per_second": 7.743,
2022
+ "step": 110250
2023
+ },
2024
+ {
2025
+ "epoch": 42.1,
2026
+ "learning_rate": 0.0028952380952380953,
2027
+ "loss": 0.7184,
2028
+ "step": 110500
2029
+ },
2030
+ {
2031
+ "epoch": 42.29,
2032
+ "learning_rate": 0.002885714285714286,
2033
+ "loss": 0.6956,
2034
+ "step": 111000
2035
+ },
2036
+ {
2037
+ "epoch": 42.48,
2038
+ "learning_rate": 0.002876190476190476,
2039
+ "loss": 0.713,
2040
+ "step": 111500
2041
+ },
2042
+ {
2043
+ "epoch": 42.67,
2044
+ "learning_rate": 0.0028666666666666667,
2045
+ "loss": 0.7328,
2046
+ "step": 112000
2047
+ },
2048
+ {
2049
+ "epoch": 42.86,
2050
+ "learning_rate": 0.002857142857142857,
2051
+ "loss": 0.7417,
2052
+ "step": 112500
2053
+ },
2054
+ {
2055
+ "epoch": 43.0,
2056
+ "eval_accuracy": 0.6685659849364413,
2057
+ "eval_loss": 1.8335988521575928,
2058
+ "eval_runtime": 4071.9751,
2059
+ "eval_samples_per_second": 41.247,
2060
+ "eval_steps_per_second": 5.156,
2061
+ "step": 112875
2062
+ },
2063
+ {
2064
+ "epoch": 43.05,
2065
+ "learning_rate": 0.0028476190476190476,
2066
+ "loss": 0.7272,
2067
+ "step": 113000
2068
+ },
2069
+ {
2070
+ "epoch": 43.24,
2071
+ "learning_rate": 0.002838095238095238,
2072
+ "loss": 0.6728,
2073
+ "step": 113500
2074
+ },
2075
+ {
2076
+ "epoch": 43.43,
2077
+ "learning_rate": 0.0028285714285714286,
2078
+ "loss": 0.6954,
2079
+ "step": 114000
2080
+ },
2081
+ {
2082
+ "epoch": 43.62,
2083
+ "learning_rate": 0.002819047619047619,
2084
+ "loss": 0.7025,
2085
+ "step": 114500
2086
+ },
2087
+ {
2088
+ "epoch": 43.81,
2089
+ "learning_rate": 0.0028095238095238095,
2090
+ "loss": 0.7208,
2091
+ "step": 115000
2092
+ },
2093
+ {
2094
+ "epoch": 44.0,
2095
+ "learning_rate": 0.0028000000000000004,
2096
+ "loss": 0.7214,
2097
+ "step": 115500
2098
+ },
2099
+ {
2100
+ "epoch": 44.0,
2101
+ "eval_accuracy": 0.6755142746569022,
2102
+ "eval_loss": 1.7522037029266357,
2103
+ "eval_runtime": 3540.85,
2104
+ "eval_samples_per_second": 47.434,
2105
+ "eval_steps_per_second": 5.929,
2106
+ "step": 115500
2107
+ },
2108
+ {
2109
+ "epoch": 44.19,
2110
+ "learning_rate": 0.0027904761904761904,
2111
+ "loss": 0.6433,
2112
+ "step": 116000
2113
+ },
2114
+ {
2115
+ "epoch": 44.38,
2116
+ "learning_rate": 0.0027809523809523813,
2117
+ "loss": 0.6634,
2118
+ "step": 116500
2119
+ },
2120
+ {
2121
+ "epoch": 44.57,
2122
+ "learning_rate": 0.0027714285714285714,
2123
+ "loss": 0.6854,
2124
+ "step": 117000
2125
+ },
2126
+ {
2127
+ "epoch": 44.76,
2128
+ "learning_rate": 0.0027619047619047623,
2129
+ "loss": 0.7026,
2130
+ "step": 117500
2131
+ },
2132
+ {
2133
+ "epoch": 44.95,
2134
+ "learning_rate": 0.0027523809523809523,
2135
+ "loss": 0.7113,
2136
+ "step": 118000
2137
+ },
2138
+ {
2139
+ "epoch": 45.0,
2140
+ "eval_accuracy": 0.6773778690720729,
2141
+ "eval_loss": 1.7851730585098267,
2142
+ "eval_runtime": 2781.7496,
2143
+ "eval_samples_per_second": 60.377,
2144
+ "eval_steps_per_second": 7.547,
2145
+ "step": 118125
2146
+ },
2147
+ {
2148
+ "epoch": 45.14,
2149
+ "learning_rate": 0.0027428571428571432,
2150
+ "loss": 0.6189,
2151
+ "step": 118500
2152
+ },
2153
+ {
2154
+ "epoch": 45.33,
2155
+ "learning_rate": 0.0027333333333333333,
2156
+ "loss": 0.6497,
2157
+ "step": 119000
2158
+ },
2159
+ {
2160
+ "epoch": 45.52,
2161
+ "learning_rate": 0.0027238095238095237,
2162
+ "loss": 0.666,
2163
+ "step": 119500
2164
+ },
2165
+ {
2166
+ "epoch": 45.71,
2167
+ "learning_rate": 0.0027142857142857142,
2168
+ "loss": 0.6832,
2169
+ "step": 120000
2170
+ },
2171
+ {
2172
+ "epoch": 45.9,
2173
+ "learning_rate": 0.0027047619047619047,
2174
+ "loss": 0.6954,
2175
+ "step": 120500
2176
+ },
2177
+ {
2178
+ "epoch": 46.0,
2179
+ "eval_accuracy": 0.6773719150963056,
2180
+ "eval_loss": 1.7557201385498047,
2181
+ "eval_runtime": 6130.4043,
2182
+ "eval_samples_per_second": 27.397,
2183
+ "eval_steps_per_second": 3.425,
2184
+ "step": 120750
2185
+ },
2186
+ {
2187
+ "epoch": 46.1,
2188
+ "learning_rate": 0.002695238095238095,
2189
+ "loss": 0.6582,
2190
+ "step": 121000
2191
+ },
2192
+ {
2193
+ "epoch": 46.29,
2194
+ "learning_rate": 0.0026857142857142856,
2195
+ "loss": 0.6174,
2196
+ "step": 121500
2197
+ },
2198
+ {
2199
+ "epoch": 46.48,
2200
+ "learning_rate": 0.0026761904761904765,
2201
+ "loss": 0.6356,
2202
+ "step": 122000
2203
+ },
2204
+ {
2205
+ "epoch": 46.67,
2206
+ "learning_rate": 0.0026666666666666666,
2207
+ "loss": 0.6565,
2208
+ "step": 122500
2209
+ },
2210
+ {
2211
+ "epoch": 46.86,
2212
+ "learning_rate": 0.0026571428571428575,
2213
+ "loss": 0.6658,
2214
+ "step": 123000
2215
+ },
2216
+ {
2217
+ "epoch": 47.0,
2218
+ "eval_accuracy": 0.678836593135066,
2219
+ "eval_loss": 1.8116456270217896,
2220
+ "eval_runtime": 5170.8601,
2221
+ "eval_samples_per_second": 32.481,
2222
+ "eval_steps_per_second": 4.06,
2223
+ "step": 123375
2224
+ },
2225
+ {
2226
+ "epoch": 47.05,
2227
+ "learning_rate": 0.0026476190476190475,
2228
+ "loss": 0.6552,
2229
+ "step": 123500
2230
+ },
2231
+ {
2232
+ "epoch": 47.24,
2233
+ "learning_rate": 0.0026380952380952384,
2234
+ "loss": 0.5994,
2235
+ "step": 124000
2236
+ },
2237
+ {
2238
+ "epoch": 47.43,
2239
+ "learning_rate": 0.0026285714285714285,
2240
+ "loss": 0.6221,
2241
+ "step": 124500
2242
+ },
2243
+ {
2244
+ "epoch": 47.62,
2245
+ "learning_rate": 0.0026190476190476194,
2246
+ "loss": 0.635,
2247
+ "step": 125000
2248
+ },
2249
+ {
2250
+ "epoch": 47.81,
2251
+ "learning_rate": 0.0026095238095238094,
2252
+ "loss": 0.648,
2253
+ "step": 125500
2254
+ },
2255
+ {
2256
+ "epoch": 48.0,
2257
+ "learning_rate": 0.0026000000000000003,
2258
+ "loss": 0.6593,
2259
+ "step": 126000
2260
+ },
2261
+ {
2262
+ "epoch": 48.0,
2263
+ "eval_accuracy": 0.6829388824387485,
2264
+ "eval_loss": 1.8153612613677979,
2265
+ "eval_runtime": 3984.2658,
2266
+ "eval_samples_per_second": 42.155,
2267
+ "eval_steps_per_second": 5.269,
2268
+ "step": 126000
2269
+ },
2270
+ {
2271
+ "epoch": 48.19,
2272
+ "learning_rate": 0.0025904761904761904,
2273
+ "loss": 0.58,
2274
+ "step": 126500
2275
+ },
2276
+ {
2277
+ "epoch": 48.38,
2278
+ "learning_rate": 0.0025809523809523813,
2279
+ "loss": 0.5966,
2280
+ "step": 127000
2281
+ },
2282
+ {
2283
+ "epoch": 48.57,
2284
+ "learning_rate": 0.0025714285714285713,
2285
+ "loss": 0.6184,
2286
+ "step": 127500
2287
+ },
2288
+ {
2289
+ "epoch": 48.76,
2290
+ "learning_rate": 0.0025619047619047618,
2291
+ "loss": 0.6257,
2292
+ "step": 128000
2293
+ },
2294
+ {
2295
+ "epoch": 48.95,
2296
+ "learning_rate": 0.0025523809523809527,
2297
+ "loss": 0.6384,
2298
+ "step": 128500
2299
+ },
2300
+ {
2301
+ "epoch": 49.0,
2302
+ "eval_accuracy": 0.6795213003483076,
2303
+ "eval_loss": 1.7874691486358643,
2304
+ "eval_runtime": 3000.761,
2305
+ "eval_samples_per_second": 55.971,
2306
+ "eval_steps_per_second": 6.997,
2307
+ "step": 128625
2308
+ },
2309
+ {
2310
+ "epoch": 49.14,
2311
+ "learning_rate": 0.0025428571428571427,
2312
+ "loss": 0.5834,
2313
+ "step": 129000
2314
+ },
2315
+ {
2316
+ "epoch": 49.33,
2317
+ "learning_rate": 0.0025333333333333336,
2318
+ "loss": 0.5847,
2319
+ "step": 129500
2320
+ },
2321
+ {
2322
+ "epoch": 49.52,
2323
+ "learning_rate": 0.0025238095238095237,
2324
+ "loss": 0.5981,
2325
+ "step": 130000
2326
+ },
2327
+ {
2328
+ "epoch": 49.71,
2329
+ "learning_rate": 0.0025142857142857146,
2330
+ "loss": 0.6141,
2331
+ "step": 130500
2332
+ },
2333
+ {
2334
+ "epoch": 49.9,
2335
+ "learning_rate": 0.0025047619047619046,
2336
+ "loss": 0.6257,
2337
+ "step": 131000
2338
+ },
2339
+ {
2340
+ "epoch": 50.0,
2341
+ "eval_accuracy": 0.6811050579024144,
2342
+ "eval_loss": 1.882097840309143,
2343
+ "eval_runtime": 2757.9073,
2344
+ "eval_samples_per_second": 60.899,
2345
+ "eval_steps_per_second": 7.613,
2346
+ "step": 131250
2347
+ },
2348
+ {
2349
+ "epoch": 50.1,
2350
+ "learning_rate": 0.0024952380952380955,
2351
+ "loss": 0.5799,
2352
+ "step": 131500
2353
+ },
2354
+ {
2355
+ "epoch": 50.29,
2356
+ "learning_rate": 0.002485714285714286,
2357
+ "loss": 0.5715,
2358
+ "step": 132000
2359
+ },
2360
+ {
2361
+ "epoch": 50.48,
2362
+ "learning_rate": 0.0024761904761904764,
2363
+ "loss": 0.5847,
2364
+ "step": 132500
2365
+ },
2366
+ {
2367
+ "epoch": 50.67,
2368
+ "learning_rate": 0.002466666666666667,
2369
+ "loss": 0.5942,
2370
+ "step": 133000
2371
+ },
2372
+ {
2373
+ "epoch": 50.86,
2374
+ "learning_rate": 0.0024571428571428574,
2375
+ "loss": 0.5999,
2376
+ "step": 133500
2377
+ },
2378
+ {
2379
+ "epoch": 51.0,
2380
+ "eval_accuracy": 0.6871840671608467,
2381
+ "eval_loss": 1.8406310081481934,
2382
+ "eval_runtime": 2763.7082,
2383
+ "eval_samples_per_second": 60.772,
2384
+ "eval_steps_per_second": 7.597,
2385
+ "step": 133875
2386
+ },
2387
+ {
2388
+ "epoch": 51.05,
2389
+ "learning_rate": 0.002447619047619048,
2390
+ "loss": 0.5872,
2391
+ "step": 134000
2392
+ },
2393
+ {
2394
+ "epoch": 51.24,
2395
+ "learning_rate": 0.0024380952380952383,
2396
+ "loss": 0.5511,
2397
+ "step": 134500
2398
+ },
2399
+ {
2400
+ "epoch": 51.43,
2401
+ "learning_rate": 0.002428571428571429,
2402
+ "loss": 0.5592,
2403
+ "step": 135000
2404
+ },
2405
+ {
2406
+ "epoch": 51.62,
2407
+ "learning_rate": 0.002419047619047619,
2408
+ "loss": 0.5658,
2409
+ "step": 135500
2410
+ },
2411
+ {
2412
+ "epoch": 51.81,
2413
+ "learning_rate": 0.0024095238095238093,
2414
+ "loss": 0.5887,
2415
+ "step": 136000
2416
+ },
2417
+ {
2418
+ "epoch": 52.0,
2419
+ "learning_rate": 0.0024,
2420
+ "loss": 0.5924,
2421
+ "step": 136500
2422
+ },
2423
+ {
2424
+ "epoch": 52.0,
2425
+ "eval_accuracy": 0.6764728647554404,
2426
+ "eval_loss": 1.9697257280349731,
2427
+ "eval_runtime": 2760.1617,
2428
+ "eval_samples_per_second": 60.85,
2429
+ "eval_steps_per_second": 7.606,
2430
+ "step": 136500
2431
+ },
2432
+ {
2433
+ "epoch": 52.19,
2434
+ "learning_rate": 0.0023904761904761903,
2435
+ "loss": 0.5313,
2436
+ "step": 137000
2437
+ },
2438
+ {
2439
+ "epoch": 52.38,
2440
+ "learning_rate": 0.0023809523809523807,
2441
+ "loss": 0.5451,
2442
+ "step": 137500
2443
+ },
2444
+ {
2445
+ "epoch": 52.57,
2446
+ "learning_rate": 0.0023714285714285716,
2447
+ "loss": 0.56,
2448
+ "step": 138000
2449
+ },
2450
+ {
2451
+ "epoch": 52.76,
2452
+ "learning_rate": 0.002361904761904762,
2453
+ "loss": 0.5676,
2454
+ "step": 138500
2455
+ },
2456
+ {
2457
+ "epoch": 52.95,
2458
+ "learning_rate": 0.0023523809523809526,
2459
+ "loss": 0.5812,
2460
+ "step": 139000
2461
+ },
2462
+ {
2463
+ "epoch": 53.0,
2464
+ "eval_accuracy": 0.6818195349944925,
2465
+ "eval_loss": 1.9344266653060913,
2466
+ "eval_runtime": 2852.1671,
2467
+ "eval_samples_per_second": 58.887,
2468
+ "eval_steps_per_second": 7.361,
2469
+ "step": 139125
2470
+ },
2471
+ {
2472
+ "epoch": 53.14,
2473
+ "learning_rate": 0.002342857142857143,
2474
+ "loss": 0.506,
2475
+ "step": 139500
2476
+ },
2477
+ {
2478
+ "epoch": 53.33,
2479
+ "learning_rate": 0.0023333333333333335,
2480
+ "loss": 0.541,
2481
+ "step": 140000
2482
+ },
2483
+ {
2484
+ "epoch": 53.52,
2485
+ "learning_rate": 0.002323809523809524,
2486
+ "loss": 0.5445,
2487
+ "step": 140500
2488
+ },
2489
+ {
2490
+ "epoch": 53.71,
2491
+ "learning_rate": 0.0023142857142857145,
2492
+ "loss": 0.5547,
2493
+ "step": 141000
2494
+ },
2495
+ {
2496
+ "epoch": 53.9,
2497
+ "learning_rate": 0.002304761904761905,
2498
+ "loss": 0.5521,
2499
+ "step": 141500
2500
+ },
2501
+ {
2502
+ "epoch": 54.0,
2503
+ "eval_accuracy": 0.6802060075615493,
2504
+ "eval_loss": 1.9845408201217651,
2505
+ "eval_runtime": 3383.9959,
2506
+ "eval_samples_per_second": 49.632,
2507
+ "eval_steps_per_second": 6.204,
2508
+ "step": 141750
2509
+ },
2510
+ {
2511
+ "epoch": 54.1,
2512
+ "learning_rate": 0.0022952380952380954,
2513
+ "loss": 0.5328,
2514
+ "step": 142000
2515
+ },
2516
+ {
2517
+ "epoch": 54.29,
2518
+ "learning_rate": 0.002285714285714286,
2519
+ "loss": 0.5141,
2520
+ "step": 142500
2521
+ },
2522
+ {
2523
+ "epoch": 54.48,
2524
+ "learning_rate": 0.0022761904761904764,
2525
+ "loss": 0.5266,
2526
+ "step": 143000
2527
+ },
2528
+ {
2529
+ "epoch": 54.67,
2530
+ "learning_rate": 0.0022666666666666664,
2531
+ "loss": 0.5354,
2532
+ "step": 143500
2533
+ },
2534
+ {
2535
+ "epoch": 54.86,
2536
+ "learning_rate": 0.002257142857142857,
2537
+ "loss": 0.556,
2538
+ "step": 144000
2539
+ },
2540
+ {
2541
+ "epoch": 55.0,
2542
+ "eval_accuracy": 0.6826173677473133,
2543
+ "eval_loss": 2.0039005279541016,
2544
+ "eval_runtime": 2747.9149,
2545
+ "eval_samples_per_second": 61.121,
2546
+ "eval_steps_per_second": 7.64,
2547
+ "step": 144375
2548
+ },
2549
+ {
2550
+ "epoch": 55.05,
2551
+ "learning_rate": 0.0022476190476190478,
2552
+ "loss": 0.5317,
2553
+ "step": 144500
2554
+ },
2555
+ {
2556
+ "epoch": 55.24,
2557
+ "learning_rate": 0.0022380952380952382,
2558
+ "loss": 0.4956,
2559
+ "step": 145000
2560
+ },
2561
+ {
2562
+ "epoch": 55.43,
2563
+ "learning_rate": 0.0022285714285714287,
2564
+ "loss": 0.512,
2565
+ "step": 145500
2566
+ },
2567
+ {
2568
+ "epoch": 55.62,
2569
+ "learning_rate": 0.002219047619047619,
2570
+ "loss": 0.5243,
2571
+ "step": 146000
2572
+ },
2573
+ {
2574
+ "epoch": 55.81,
2575
+ "learning_rate": 0.0022095238095238097,
2576
+ "loss": 0.5283,
2577
+ "step": 146500
2578
+ },
2579
+ {
2580
+ "epoch": 56.0,
2581
+ "learning_rate": 0.0022,
2582
+ "loss": 0.5412,
2583
+ "step": 147000
2584
+ },
2585
+ {
2586
+ "epoch": 56.0,
2587
+ "eval_accuracy": 0.6856896192432497,
2588
+ "eval_loss": 1.9339253902435303,
2589
+ "eval_runtime": 3227.3643,
2590
+ "eval_samples_per_second": 52.041,
2591
+ "eval_steps_per_second": 6.505,
2592
+ "step": 147000
2593
+ },
2594
+ {
2595
+ "epoch": 56.19,
2596
+ "learning_rate": 0.0021904761904761906,
2597
+ "loss": 0.4902,
2598
+ "step": 147500
2599
+ },
2600
+ {
2601
+ "epoch": 56.38,
2602
+ "learning_rate": 0.002180952380952381,
2603
+ "loss": 0.4989,
2604
+ "step": 148000
2605
+ },
2606
+ {
2607
+ "epoch": 56.57,
2608
+ "learning_rate": 0.0021714285714285715,
2609
+ "loss": 0.4979,
2610
+ "step": 148500
2611
+ },
2612
+ {
2613
+ "epoch": 56.76,
2614
+ "learning_rate": 0.002161904761904762,
2615
+ "loss": 0.5194,
2616
+ "step": 149000
2617
+ },
2618
+ {
2619
+ "epoch": 56.95,
2620
+ "learning_rate": 0.0021523809523809525,
2621
+ "loss": 0.5204,
2622
+ "step": 149500
2623
+ },
2624
+ {
2625
+ "epoch": 57.0,
2626
+ "eval_accuracy": 0.6872436069185198,
2627
+ "eval_loss": 2.0444183349609375,
2628
+ "eval_runtime": 2731.7184,
2629
+ "eval_samples_per_second": 61.483,
2630
+ "eval_steps_per_second": 7.686,
2631
+ "step": 149625
2632
+ },
2633
+ {
2634
+ "epoch": 57.14,
2635
+ "learning_rate": 0.002142857142857143,
2636
+ "loss": 0.4842,
2637
+ "step": 150000
2638
+ },
2639
+ {
2640
+ "epoch": 57.33,
2641
+ "learning_rate": 0.0021333333333333334,
2642
+ "loss": 0.4815,
2643
+ "step": 150500
2644
+ },
2645
+ {
2646
+ "epoch": 57.52,
2647
+ "learning_rate": 0.002123809523809524,
2648
+ "loss": 0.4911,
2649
+ "step": 151000
2650
+ },
2651
+ {
2652
+ "epoch": 57.71,
2653
+ "learning_rate": 0.0021142857142857144,
2654
+ "loss": 0.4954,
2655
+ "step": 151500
2656
+ },
2657
+ {
2658
+ "epoch": 57.9,
2659
+ "learning_rate": 0.002104761904761905,
2660
+ "loss": 0.5051,
2661
+ "step": 152000
2662
+ },
2663
+ {
2664
+ "epoch": 58.0,
2665
+ "eval_accuracy": 0.689601381322378,
2666
+ "eval_loss": 1.9677560329437256,
2667
+ "eval_runtime": 2701.9047,
2668
+ "eval_samples_per_second": 62.162,
2669
+ "eval_steps_per_second": 7.77,
2670
+ "step": 152250
2671
+ },
2672
+ {
2673
+ "epoch": 58.1,
2674
+ "learning_rate": 0.0020952380952380953,
2675
+ "loss": 0.4901,
2676
+ "step": 152500
2677
+ },
2678
+ {
2679
+ "epoch": 58.29,
2680
+ "learning_rate": 0.002085714285714286,
2681
+ "loss": 0.4763,
2682
+ "step": 153000
2683
+ },
2684
+ {
2685
+ "epoch": 58.48,
2686
+ "learning_rate": 0.0020761904761904763,
2687
+ "loss": 0.4806,
2688
+ "step": 153500
2689
+ },
2690
+ {
2691
+ "epoch": 58.67,
2692
+ "learning_rate": 0.0020666666666666667,
2693
+ "loss": 0.4858,
2694
+ "step": 154000
2695
+ },
2696
+ {
2697
+ "epoch": 58.86,
2698
+ "learning_rate": 0.002057142857142857,
2699
+ "loss": 0.4977,
2700
+ "step": 154500
2701
+ },
2702
+ {
2703
+ "epoch": 59.0,
2704
+ "eval_accuracy": 0.6877735107618111,
2705
+ "eval_loss": 2.0384891033172607,
2706
+ "eval_runtime": 2728.0396,
2707
+ "eval_samples_per_second": 61.566,
2708
+ "eval_steps_per_second": 7.696,
2709
+ "step": 154875
2710
+ },
2711
+ {
2712
+ "epoch": 59.05,
2713
+ "learning_rate": 0.0020476190476190477,
2714
+ "loss": 0.483,
2715
+ "step": 155000
2716
+ },
2717
+ {
2718
+ "epoch": 59.24,
2719
+ "learning_rate": 0.002038095238095238,
2720
+ "loss": 0.4575,
2721
+ "step": 155500
2722
+ },
2723
+ {
2724
+ "epoch": 59.43,
2725
+ "learning_rate": 0.0020285714285714286,
2726
+ "loss": 0.4705,
2727
+ "step": 156000
2728
+ },
2729
+ {
2730
+ "epoch": 59.62,
2731
+ "learning_rate": 0.002019047619047619,
2732
+ "loss": 0.4846,
2733
+ "step": 156500
2734
+ },
2735
+ {
2736
+ "epoch": 59.81,
2737
+ "learning_rate": 0.0020095238095238096,
2738
+ "loss": 0.4872,
2739
+ "step": 157000
2740
+ },
2741
+ {
2742
+ "epoch": 60.0,
2743
+ "learning_rate": 0.002,
2744
+ "loss": 0.4932,
2745
+ "step": 157500
2746
+ },
2747
+ {
2748
+ "epoch": 60.0,
2749
+ "eval_accuracy": 0.6890178916971808,
2750
+ "eval_loss": 2.05938720703125,
2751
+ "eval_runtime": 2847.0911,
2752
+ "eval_samples_per_second": 58.992,
2753
+ "eval_steps_per_second": 7.374,
2754
+ "step": 157500
2755
+ },
2756
+ {
2757
+ "epoch": 60.19,
2758
+ "learning_rate": 0.0019904761904761905,
2759
+ "loss": 0.4454,
2760
+ "step": 158000
2761
+ },
2762
+ {
2763
+ "epoch": 60.38,
2764
+ "learning_rate": 0.001980952380952381,
2765
+ "loss": 0.4559,
2766
+ "step": 158500
2767
+ },
2768
+ {
2769
+ "epoch": 60.57,
2770
+ "learning_rate": 0.0019714285714285715,
2771
+ "loss": 0.46,
2772
+ "step": 159000
2773
+ },
2774
+ {
2775
+ "epoch": 60.76,
2776
+ "learning_rate": 0.001961904761904762,
2777
+ "loss": 0.472,
2778
+ "step": 159500
2779
+ },
2780
+ {
2781
+ "epoch": 60.95,
2782
+ "learning_rate": 0.0019523809523809524,
2783
+ "loss": 0.4689,
2784
+ "step": 160000
2785
+ },
2786
+ {
2787
+ "epoch": 61.0,
2788
+ "eval_accuracy": 0.6834806942335745,
2789
+ "eval_loss": 2.1333200931549072,
2790
+ "eval_runtime": 3489.1293,
2791
+ "eval_samples_per_second": 48.137,
2792
+ "eval_steps_per_second": 6.017,
2793
+ "step": 160125
2794
+ },
2795
+ {
2796
+ "epoch": 61.14,
2797
+ "learning_rate": 0.0019428571428571429,
2798
+ "loss": 0.4402,
2799
+ "step": 160500
2800
+ },
2801
+ {
2802
+ "epoch": 61.33,
2803
+ "learning_rate": 0.0019333333333333333,
2804
+ "loss": 0.445,
2805
+ "step": 161000
2806
+ },
2807
+ {
2808
+ "epoch": 61.52,
2809
+ "learning_rate": 0.0019238095238095238,
2810
+ "loss": 0.4504,
2811
+ "step": 161500
2812
+ },
2813
+ {
2814
+ "epoch": 61.71,
2815
+ "learning_rate": 0.0019142857142857143,
2816
+ "loss": 0.4577,
2817
+ "step": 162000
2818
+ },
2819
+ {
2820
+ "epoch": 61.9,
2821
+ "learning_rate": 0.0019047619047619048,
2822
+ "loss": 0.4652,
2823
+ "step": 162500
2824
+ },
2825
+ {
2826
+ "epoch": 62.0,
2827
+ "eval_accuracy": 0.6899228960138132,
2828
+ "eval_loss": 2.1436057090759277,
2829
+ "eval_runtime": 2832.828,
2830
+ "eval_samples_per_second": 59.289,
2831
+ "eval_steps_per_second": 7.411,
2832
+ "step": 162750
2833
+ },
2834
+ {
2835
+ "epoch": 62.1,
2836
+ "learning_rate": 0.0018952380952380952,
2837
+ "loss": 0.4362,
2838
+ "step": 163000
2839
+ },
2840
+ {
2841
+ "epoch": 62.29,
2842
+ "learning_rate": 0.001885714285714286,
2843
+ "loss": 0.4351,
2844
+ "step": 163500
2845
+ },
2846
+ {
2847
+ "epoch": 62.48,
2848
+ "learning_rate": 0.0018761904761904764,
2849
+ "loss": 0.4404,
2850
+ "step": 164000
2851
+ },
2852
+ {
2853
+ "epoch": 62.67,
2854
+ "learning_rate": 0.0018666666666666669,
2855
+ "loss": 0.4443,
2856
+ "step": 164500
2857
+ },
2858
+ {
2859
+ "epoch": 62.86,
2860
+ "learning_rate": 0.0018571428571428573,
2861
+ "loss": 0.4515,
2862
+ "step": 165000
2863
+ },
2864
+ {
2865
+ "epoch": 63.0,
2866
+ "eval_accuracy": 0.6923283022238099,
2867
+ "eval_loss": 2.1558289527893066,
2868
+ "eval_runtime": 2793.0236,
2869
+ "eval_samples_per_second": 60.134,
2870
+ "eval_steps_per_second": 7.517,
2871
+ "step": 165375
2872
+ },
2873
+ {
2874
+ "epoch": 63.05,
2875
+ "learning_rate": 0.0018476190476190476,
2876
+ "loss": 0.4448,
2877
+ "step": 165500
2878
+ },
2879
+ {
2880
+ "epoch": 63.24,
2881
+ "learning_rate": 0.001838095238095238,
2882
+ "loss": 0.4217,
2883
+ "step": 166000
2884
+ },
2885
+ {
2886
+ "epoch": 63.43,
2887
+ "learning_rate": 0.0018285714285714285,
2888
+ "loss": 0.4293,
2889
+ "step": 166500
2890
+ },
2891
+ {
2892
+ "epoch": 63.62,
2893
+ "learning_rate": 0.001819047619047619,
2894
+ "loss": 0.4295,
2895
+ "step": 167000
2896
+ },
2897
+ {
2898
+ "epoch": 63.81,
2899
+ "learning_rate": 0.0018095238095238095,
2900
+ "loss": 0.4383,
2901
+ "step": 167500
2902
+ },
2903
+ {
2904
+ "epoch": 64.0,
2905
+ "learning_rate": 0.0018,
2906
+ "loss": 0.4542,
2907
+ "step": 168000
2908
+ },
2909
+ {
2910
+ "epoch": 64.0,
2911
+ "eval_accuracy": 0.6934059718376946,
2912
+ "eval_loss": 2.0637614727020264,
2913
+ "eval_runtime": 2998.8188,
2914
+ "eval_samples_per_second": 56.007,
2915
+ "eval_steps_per_second": 7.001,
2916
+ "step": 168000
2917
+ },
2918
+ {
2919
+ "epoch": 64.0,
2920
+ "step": 168000,
2921
+ "total_flos": 4.5354579076109147e+21,
2922
+ "train_loss": 0.0,
2923
+ "train_runtime": 758.3182,
2924
+ "train_samples_per_second": 53155.811,
2925
+ "train_steps_per_second": 207.696
2926
  }
2927
  ],
2928
  "logging_steps": 500,
2929
+ "max_steps": 157500,
2930
+ "num_train_epochs": 60,
2931
  "save_steps": 500,
2932
+ "total_flos": 4.5354579076109147e+21,
2933
  "trial_name": null,
2934
  "trial_params": null
2935
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:442f416aa0b965f8dcbd5002c930ed275aed8271e803557955df8397cd09a84a
3
  size 4792
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a25946f7825b00ddb42330dbeeb0f0a0fa0970c7672d1e70207329539d0944b1
3
  size 4792