CrabInHoney commited on
Commit
2a330d3
·
verified ·
1 Parent(s): 837ccb1

Upload 11 files

Browse files
added_tokens.json ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "<extra_id_0>": 2160,
3
+ "<extra_id_10>": 2150,
4
+ "<extra_id_11>": 2149,
5
+ "<extra_id_12>": 2148,
6
+ "<extra_id_13>": 2147,
7
+ "<extra_id_14>": 2146,
8
+ "<extra_id_15>": 2145,
9
+ "<extra_id_16>": 2144,
10
+ "<extra_id_17>": 2143,
11
+ "<extra_id_18>": 2142,
12
+ "<extra_id_19>": 2141,
13
+ "<extra_id_1>": 2159,
14
+ "<extra_id_20>": 2140,
15
+ "<extra_id_21>": 2139,
16
+ "<extra_id_22>": 2138,
17
+ "<extra_id_23>": 2137,
18
+ "<extra_id_24>": 2136,
19
+ "<extra_id_25>": 2135,
20
+ "<extra_id_26>": 2134,
21
+ "<extra_id_27>": 2133,
22
+ "<extra_id_28>": 2132,
23
+ "<extra_id_29>": 2131,
24
+ "<extra_id_2>": 2158,
25
+ "<extra_id_30>": 2130,
26
+ "<extra_id_31>": 2129,
27
+ "<extra_id_32>": 2128,
28
+ "<extra_id_33>": 2127,
29
+ "<extra_id_34>": 2126,
30
+ "<extra_id_35>": 2125,
31
+ "<extra_id_36>": 2124,
32
+ "<extra_id_37>": 2123,
33
+ "<extra_id_38>": 2122,
34
+ "<extra_id_39>": 2121,
35
+ "<extra_id_3>": 2157,
36
+ "<extra_id_40>": 2120,
37
+ "<extra_id_41>": 2119,
38
+ "<extra_id_42>": 2118,
39
+ "<extra_id_43>": 2117,
40
+ "<extra_id_44>": 2116,
41
+ "<extra_id_45>": 2115,
42
+ "<extra_id_46>": 2114,
43
+ "<extra_id_47>": 2113,
44
+ "<extra_id_48>": 2112,
45
+ "<extra_id_49>": 2111,
46
+ "<extra_id_4>": 2156,
47
+ "<extra_id_50>": 2110,
48
+ "<extra_id_51>": 2109,
49
+ "<extra_id_52>": 2108,
50
+ "<extra_id_53>": 2107,
51
+ "<extra_id_54>": 2106,
52
+ "<extra_id_55>": 2105,
53
+ "<extra_id_56>": 2104,
54
+ "<extra_id_57>": 2103,
55
+ "<extra_id_58>": 2102,
56
+ "<extra_id_59>": 2101,
57
+ "<extra_id_5>": 2155,
58
+ "<extra_id_60>": 2100,
59
+ "<extra_id_61>": 2099,
60
+ "<extra_id_62>": 2098,
61
+ "<extra_id_63>": 2097,
62
+ "<extra_id_64>": 2096,
63
+ "<extra_id_65>": 2095,
64
+ "<extra_id_66>": 2094,
65
+ "<extra_id_67>": 2093,
66
+ "<extra_id_68>": 2092,
67
+ "<extra_id_69>": 2091,
68
+ "<extra_id_6>": 2154,
69
+ "<extra_id_70>": 2090,
70
+ "<extra_id_71>": 2089,
71
+ "<extra_id_72>": 2088,
72
+ "<extra_id_73>": 2087,
73
+ "<extra_id_74>": 2086,
74
+ "<extra_id_75>": 2085,
75
+ "<extra_id_76>": 2084,
76
+ "<extra_id_77>": 2083,
77
+ "<extra_id_78>": 2082,
78
+ "<extra_id_79>": 2081,
79
+ "<extra_id_7>": 2153,
80
+ "<extra_id_80>": 2080,
81
+ "<extra_id_81>": 2079,
82
+ "<extra_id_82>": 2078,
83
+ "<extra_id_83>": 2077,
84
+ "<extra_id_84>": 2076,
85
+ "<extra_id_85>": 2075,
86
+ "<extra_id_86>": 2074,
87
+ "<extra_id_87>": 2073,
88
+ "<extra_id_88>": 2072,
89
+ "<extra_id_89>": 2071,
90
+ "<extra_id_8>": 2152,
91
+ "<extra_id_90>": 2070,
92
+ "<extra_id_91>": 2069,
93
+ "<extra_id_92>": 2068,
94
+ "<extra_id_93>": 2067,
95
+ "<extra_id_94>": 2066,
96
+ "<extra_id_95>": 2065,
97
+ "<extra_id_96>": 2064,
98
+ "<extra_id_97>": 2063,
99
+ "<extra_id_98>": 2062,
100
+ "<extra_id_99>": 2061,
101
+ "<extra_id_9>": 2151
102
+ }
config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "checkpoint-79000-TEST",
3
+ "architectures": [
4
+ "T5ForConditionalGeneration"
5
+ ],
6
+ "classifier_dropout": 0.0,
7
+ "d_ff": 2048,
8
+ "d_kv": 64,
9
+ "d_model": 512,
10
+ "decoder_start_token_id": 1,
11
+ "dense_act_fn": "gelu",
12
+ "dropout_rate": 0.1,
13
+ "eos_token_id": 2,
14
+ "feed_forward_proj": "gelu",
15
+ "initializer_factor": 1.0,
16
+ "is_encoder_decoder": true,
17
+ "is_gated_act": false,
18
+ "layer_norm_epsilon": 1e-06,
19
+ "max_position_embeddings": 1024,
20
+ "model_type": "t5",
21
+ "num_decoder_layers": 8,
22
+ "num_heads": 8,
23
+ "num_layers": 8,
24
+ "pad_token_id": 1,
25
+ "relative_attention_max_distance": 128,
26
+ "relative_attention_num_buckets": 32,
27
+ "torch_dtype": "float32",
28
+ "transformers_version": "4.44.2",
29
+ "use_cache": true,
30
+ "vocab_size": 2061
31
+ }
generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "decoder_start_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "pad_token_id": 1,
6
+ "transformers_version": "4.44.2"
7
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:379b9bee2c4a1e7761997d383dc6a57df9e329736c86ec865d6eaf07560d4ca9
3
+ size 239210400
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a5769e0bdbebfbb9183634a9795d1e3f5e73c6a695dca6216e221b199cd936af
3
+ size 14244
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:735fa9629ce6be070d4a9c986bb1e4caa9de1d735dbcf038f1406f75e5dbf3fc
3
+ size 1064
special_tokens_map.json ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<extra_id_0>",
4
+ "<extra_id_1>",
5
+ "<extra_id_2>",
6
+ "<extra_id_3>",
7
+ "<extra_id_4>",
8
+ "<extra_id_5>",
9
+ "<extra_id_6>",
10
+ "<extra_id_7>",
11
+ "<extra_id_8>",
12
+ "<extra_id_9>",
13
+ "<extra_id_10>",
14
+ "<extra_id_11>",
15
+ "<extra_id_12>",
16
+ "<extra_id_13>",
17
+ "<extra_id_14>",
18
+ "<extra_id_15>",
19
+ "<extra_id_16>",
20
+ "<extra_id_17>",
21
+ "<extra_id_18>",
22
+ "<extra_id_19>",
23
+ "<extra_id_20>",
24
+ "<extra_id_21>",
25
+ "<extra_id_22>",
26
+ "<extra_id_23>",
27
+ "<extra_id_24>",
28
+ "<extra_id_25>",
29
+ "<extra_id_26>",
30
+ "<extra_id_27>",
31
+ "<extra_id_28>",
32
+ "<extra_id_29>",
33
+ "<extra_id_30>",
34
+ "<extra_id_31>",
35
+ "<extra_id_32>",
36
+ "<extra_id_33>",
37
+ "<extra_id_34>",
38
+ "<extra_id_35>",
39
+ "<extra_id_36>",
40
+ "<extra_id_37>",
41
+ "<extra_id_38>",
42
+ "<extra_id_39>",
43
+ "<extra_id_40>",
44
+ "<extra_id_41>",
45
+ "<extra_id_42>",
46
+ "<extra_id_43>",
47
+ "<extra_id_44>",
48
+ "<extra_id_45>",
49
+ "<extra_id_46>",
50
+ "<extra_id_47>",
51
+ "<extra_id_48>",
52
+ "<extra_id_49>",
53
+ "<extra_id_50>",
54
+ "<extra_id_51>",
55
+ "<extra_id_52>",
56
+ "<extra_id_53>",
57
+ "<extra_id_54>",
58
+ "<extra_id_55>",
59
+ "<extra_id_56>",
60
+ "<extra_id_57>",
61
+ "<extra_id_58>",
62
+ "<extra_id_59>",
63
+ "<extra_id_60>",
64
+ "<extra_id_61>",
65
+ "<extra_id_62>",
66
+ "<extra_id_63>",
67
+ "<extra_id_64>",
68
+ "<extra_id_65>",
69
+ "<extra_id_66>",
70
+ "<extra_id_67>",
71
+ "<extra_id_68>",
72
+ "<extra_id_69>",
73
+ "<extra_id_70>",
74
+ "<extra_id_71>",
75
+ "<extra_id_72>",
76
+ "<extra_id_73>",
77
+ "<extra_id_74>",
78
+ "<extra_id_75>",
79
+ "<extra_id_76>",
80
+ "<extra_id_77>",
81
+ "<extra_id_78>",
82
+ "<extra_id_79>",
83
+ "<extra_id_80>",
84
+ "<extra_id_81>",
85
+ "<extra_id_82>",
86
+ "<extra_id_83>",
87
+ "<extra_id_84>",
88
+ "<extra_id_85>",
89
+ "<extra_id_86>",
90
+ "<extra_id_87>",
91
+ "<extra_id_88>",
92
+ "<extra_id_89>",
93
+ "<extra_id_90>",
94
+ "<extra_id_91>",
95
+ "<extra_id_92>",
96
+ "<extra_id_93>",
97
+ "<extra_id_94>",
98
+ "<extra_id_95>",
99
+ "<extra_id_96>",
100
+ "<extra_id_97>",
101
+ "<extra_id_98>",
102
+ "<extra_id_99>"
103
+ ],
104
+ "eos_token": {
105
+ "content": "</s>",
106
+ "lstrip": false,
107
+ "normalized": false,
108
+ "rstrip": false,
109
+ "single_word": false
110
+ },
111
+ "pad_token": {
112
+ "content": "<pad>",
113
+ "lstrip": false,
114
+ "normalized": false,
115
+ "rstrip": false,
116
+ "single_word": false
117
+ },
118
+ "unk_token": {
119
+ "content": "<unk>",
120
+ "lstrip": false,
121
+ "normalized": false,
122
+ "rstrip": false,
123
+ "single_word": false
124
+ }
125
+ }
spiece.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b7ca7001715be416b315945f1d1b95b76255795719723a9ad345913a76f07130
3
+ size 269621
tokenizer_config.json ADDED
@@ -0,0 +1,940 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": true,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<unk>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "<pad>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "</s>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "2061": {
29
+ "content": "<extra_id_99>",
30
+ "lstrip": true,
31
+ "normalized": false,
32
+ "rstrip": true,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "2062": {
37
+ "content": "<extra_id_98>",
38
+ "lstrip": true,
39
+ "normalized": false,
40
+ "rstrip": true,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "2063": {
45
+ "content": "<extra_id_97>",
46
+ "lstrip": true,
47
+ "normalized": false,
48
+ "rstrip": true,
49
+ "single_word": false,
50
+ "special": true
51
+ },
52
+ "2064": {
53
+ "content": "<extra_id_96>",
54
+ "lstrip": true,
55
+ "normalized": false,
56
+ "rstrip": true,
57
+ "single_word": false,
58
+ "special": true
59
+ },
60
+ "2065": {
61
+ "content": "<extra_id_95>",
62
+ "lstrip": true,
63
+ "normalized": false,
64
+ "rstrip": true,
65
+ "single_word": false,
66
+ "special": true
67
+ },
68
+ "2066": {
69
+ "content": "<extra_id_94>",
70
+ "lstrip": true,
71
+ "normalized": false,
72
+ "rstrip": true,
73
+ "single_word": false,
74
+ "special": true
75
+ },
76
+ "2067": {
77
+ "content": "<extra_id_93>",
78
+ "lstrip": true,
79
+ "normalized": false,
80
+ "rstrip": true,
81
+ "single_word": false,
82
+ "special": true
83
+ },
84
+ "2068": {
85
+ "content": "<extra_id_92>",
86
+ "lstrip": true,
87
+ "normalized": false,
88
+ "rstrip": true,
89
+ "single_word": false,
90
+ "special": true
91
+ },
92
+ "2069": {
93
+ "content": "<extra_id_91>",
94
+ "lstrip": true,
95
+ "normalized": false,
96
+ "rstrip": true,
97
+ "single_word": false,
98
+ "special": true
99
+ },
100
+ "2070": {
101
+ "content": "<extra_id_90>",
102
+ "lstrip": true,
103
+ "normalized": false,
104
+ "rstrip": true,
105
+ "single_word": false,
106
+ "special": true
107
+ },
108
+ "2071": {
109
+ "content": "<extra_id_89>",
110
+ "lstrip": true,
111
+ "normalized": false,
112
+ "rstrip": true,
113
+ "single_word": false,
114
+ "special": true
115
+ },
116
+ "2072": {
117
+ "content": "<extra_id_88>",
118
+ "lstrip": true,
119
+ "normalized": false,
120
+ "rstrip": true,
121
+ "single_word": false,
122
+ "special": true
123
+ },
124
+ "2073": {
125
+ "content": "<extra_id_87>",
126
+ "lstrip": true,
127
+ "normalized": false,
128
+ "rstrip": true,
129
+ "single_word": false,
130
+ "special": true
131
+ },
132
+ "2074": {
133
+ "content": "<extra_id_86>",
134
+ "lstrip": true,
135
+ "normalized": false,
136
+ "rstrip": true,
137
+ "single_word": false,
138
+ "special": true
139
+ },
140
+ "2075": {
141
+ "content": "<extra_id_85>",
142
+ "lstrip": true,
143
+ "normalized": false,
144
+ "rstrip": true,
145
+ "single_word": false,
146
+ "special": true
147
+ },
148
+ "2076": {
149
+ "content": "<extra_id_84>",
150
+ "lstrip": true,
151
+ "normalized": false,
152
+ "rstrip": true,
153
+ "single_word": false,
154
+ "special": true
155
+ },
156
+ "2077": {
157
+ "content": "<extra_id_83>",
158
+ "lstrip": true,
159
+ "normalized": false,
160
+ "rstrip": true,
161
+ "single_word": false,
162
+ "special": true
163
+ },
164
+ "2078": {
165
+ "content": "<extra_id_82>",
166
+ "lstrip": true,
167
+ "normalized": false,
168
+ "rstrip": true,
169
+ "single_word": false,
170
+ "special": true
171
+ },
172
+ "2079": {
173
+ "content": "<extra_id_81>",
174
+ "lstrip": true,
175
+ "normalized": false,
176
+ "rstrip": true,
177
+ "single_word": false,
178
+ "special": true
179
+ },
180
+ "2080": {
181
+ "content": "<extra_id_80>",
182
+ "lstrip": true,
183
+ "normalized": false,
184
+ "rstrip": true,
185
+ "single_word": false,
186
+ "special": true
187
+ },
188
+ "2081": {
189
+ "content": "<extra_id_79>",
190
+ "lstrip": true,
191
+ "normalized": false,
192
+ "rstrip": true,
193
+ "single_word": false,
194
+ "special": true
195
+ },
196
+ "2082": {
197
+ "content": "<extra_id_78>",
198
+ "lstrip": true,
199
+ "normalized": false,
200
+ "rstrip": true,
201
+ "single_word": false,
202
+ "special": true
203
+ },
204
+ "2083": {
205
+ "content": "<extra_id_77>",
206
+ "lstrip": true,
207
+ "normalized": false,
208
+ "rstrip": true,
209
+ "single_word": false,
210
+ "special": true
211
+ },
212
+ "2084": {
213
+ "content": "<extra_id_76>",
214
+ "lstrip": true,
215
+ "normalized": false,
216
+ "rstrip": true,
217
+ "single_word": false,
218
+ "special": true
219
+ },
220
+ "2085": {
221
+ "content": "<extra_id_75>",
222
+ "lstrip": true,
223
+ "normalized": false,
224
+ "rstrip": true,
225
+ "single_word": false,
226
+ "special": true
227
+ },
228
+ "2086": {
229
+ "content": "<extra_id_74>",
230
+ "lstrip": true,
231
+ "normalized": false,
232
+ "rstrip": true,
233
+ "single_word": false,
234
+ "special": true
235
+ },
236
+ "2087": {
237
+ "content": "<extra_id_73>",
238
+ "lstrip": true,
239
+ "normalized": false,
240
+ "rstrip": true,
241
+ "single_word": false,
242
+ "special": true
243
+ },
244
+ "2088": {
245
+ "content": "<extra_id_72>",
246
+ "lstrip": true,
247
+ "normalized": false,
248
+ "rstrip": true,
249
+ "single_word": false,
250
+ "special": true
251
+ },
252
+ "2089": {
253
+ "content": "<extra_id_71>",
254
+ "lstrip": true,
255
+ "normalized": false,
256
+ "rstrip": true,
257
+ "single_word": false,
258
+ "special": true
259
+ },
260
+ "2090": {
261
+ "content": "<extra_id_70>",
262
+ "lstrip": true,
263
+ "normalized": false,
264
+ "rstrip": true,
265
+ "single_word": false,
266
+ "special": true
267
+ },
268
+ "2091": {
269
+ "content": "<extra_id_69>",
270
+ "lstrip": true,
271
+ "normalized": false,
272
+ "rstrip": true,
273
+ "single_word": false,
274
+ "special": true
275
+ },
276
+ "2092": {
277
+ "content": "<extra_id_68>",
278
+ "lstrip": true,
279
+ "normalized": false,
280
+ "rstrip": true,
281
+ "single_word": false,
282
+ "special": true
283
+ },
284
+ "2093": {
285
+ "content": "<extra_id_67>",
286
+ "lstrip": true,
287
+ "normalized": false,
288
+ "rstrip": true,
289
+ "single_word": false,
290
+ "special": true
291
+ },
292
+ "2094": {
293
+ "content": "<extra_id_66>",
294
+ "lstrip": true,
295
+ "normalized": false,
296
+ "rstrip": true,
297
+ "single_word": false,
298
+ "special": true
299
+ },
300
+ "2095": {
301
+ "content": "<extra_id_65>",
302
+ "lstrip": true,
303
+ "normalized": false,
304
+ "rstrip": true,
305
+ "single_word": false,
306
+ "special": true
307
+ },
308
+ "2096": {
309
+ "content": "<extra_id_64>",
310
+ "lstrip": true,
311
+ "normalized": false,
312
+ "rstrip": true,
313
+ "single_word": false,
314
+ "special": true
315
+ },
316
+ "2097": {
317
+ "content": "<extra_id_63>",
318
+ "lstrip": true,
319
+ "normalized": false,
320
+ "rstrip": true,
321
+ "single_word": false,
322
+ "special": true
323
+ },
324
+ "2098": {
325
+ "content": "<extra_id_62>",
326
+ "lstrip": true,
327
+ "normalized": false,
328
+ "rstrip": true,
329
+ "single_word": false,
330
+ "special": true
331
+ },
332
+ "2099": {
333
+ "content": "<extra_id_61>",
334
+ "lstrip": true,
335
+ "normalized": false,
336
+ "rstrip": true,
337
+ "single_word": false,
338
+ "special": true
339
+ },
340
+ "2100": {
341
+ "content": "<extra_id_60>",
342
+ "lstrip": true,
343
+ "normalized": false,
344
+ "rstrip": true,
345
+ "single_word": false,
346
+ "special": true
347
+ },
348
+ "2101": {
349
+ "content": "<extra_id_59>",
350
+ "lstrip": true,
351
+ "normalized": false,
352
+ "rstrip": true,
353
+ "single_word": false,
354
+ "special": true
355
+ },
356
+ "2102": {
357
+ "content": "<extra_id_58>",
358
+ "lstrip": true,
359
+ "normalized": false,
360
+ "rstrip": true,
361
+ "single_word": false,
362
+ "special": true
363
+ },
364
+ "2103": {
365
+ "content": "<extra_id_57>",
366
+ "lstrip": true,
367
+ "normalized": false,
368
+ "rstrip": true,
369
+ "single_word": false,
370
+ "special": true
371
+ },
372
+ "2104": {
373
+ "content": "<extra_id_56>",
374
+ "lstrip": true,
375
+ "normalized": false,
376
+ "rstrip": true,
377
+ "single_word": false,
378
+ "special": true
379
+ },
380
+ "2105": {
381
+ "content": "<extra_id_55>",
382
+ "lstrip": true,
383
+ "normalized": false,
384
+ "rstrip": true,
385
+ "single_word": false,
386
+ "special": true
387
+ },
388
+ "2106": {
389
+ "content": "<extra_id_54>",
390
+ "lstrip": true,
391
+ "normalized": false,
392
+ "rstrip": true,
393
+ "single_word": false,
394
+ "special": true
395
+ },
396
+ "2107": {
397
+ "content": "<extra_id_53>",
398
+ "lstrip": true,
399
+ "normalized": false,
400
+ "rstrip": true,
401
+ "single_word": false,
402
+ "special": true
403
+ },
404
+ "2108": {
405
+ "content": "<extra_id_52>",
406
+ "lstrip": true,
407
+ "normalized": false,
408
+ "rstrip": true,
409
+ "single_word": false,
410
+ "special": true
411
+ },
412
+ "2109": {
413
+ "content": "<extra_id_51>",
414
+ "lstrip": true,
415
+ "normalized": false,
416
+ "rstrip": true,
417
+ "single_word": false,
418
+ "special": true
419
+ },
420
+ "2110": {
421
+ "content": "<extra_id_50>",
422
+ "lstrip": true,
423
+ "normalized": false,
424
+ "rstrip": true,
425
+ "single_word": false,
426
+ "special": true
427
+ },
428
+ "2111": {
429
+ "content": "<extra_id_49>",
430
+ "lstrip": true,
431
+ "normalized": false,
432
+ "rstrip": true,
433
+ "single_word": false,
434
+ "special": true
435
+ },
436
+ "2112": {
437
+ "content": "<extra_id_48>",
438
+ "lstrip": true,
439
+ "normalized": false,
440
+ "rstrip": true,
441
+ "single_word": false,
442
+ "special": true
443
+ },
444
+ "2113": {
445
+ "content": "<extra_id_47>",
446
+ "lstrip": true,
447
+ "normalized": false,
448
+ "rstrip": true,
449
+ "single_word": false,
450
+ "special": true
451
+ },
452
+ "2114": {
453
+ "content": "<extra_id_46>",
454
+ "lstrip": true,
455
+ "normalized": false,
456
+ "rstrip": true,
457
+ "single_word": false,
458
+ "special": true
459
+ },
460
+ "2115": {
461
+ "content": "<extra_id_45>",
462
+ "lstrip": true,
463
+ "normalized": false,
464
+ "rstrip": true,
465
+ "single_word": false,
466
+ "special": true
467
+ },
468
+ "2116": {
469
+ "content": "<extra_id_44>",
470
+ "lstrip": true,
471
+ "normalized": false,
472
+ "rstrip": true,
473
+ "single_word": false,
474
+ "special": true
475
+ },
476
+ "2117": {
477
+ "content": "<extra_id_43>",
478
+ "lstrip": true,
479
+ "normalized": false,
480
+ "rstrip": true,
481
+ "single_word": false,
482
+ "special": true
483
+ },
484
+ "2118": {
485
+ "content": "<extra_id_42>",
486
+ "lstrip": true,
487
+ "normalized": false,
488
+ "rstrip": true,
489
+ "single_word": false,
490
+ "special": true
491
+ },
492
+ "2119": {
493
+ "content": "<extra_id_41>",
494
+ "lstrip": true,
495
+ "normalized": false,
496
+ "rstrip": true,
497
+ "single_word": false,
498
+ "special": true
499
+ },
500
+ "2120": {
501
+ "content": "<extra_id_40>",
502
+ "lstrip": true,
503
+ "normalized": false,
504
+ "rstrip": true,
505
+ "single_word": false,
506
+ "special": true
507
+ },
508
+ "2121": {
509
+ "content": "<extra_id_39>",
510
+ "lstrip": true,
511
+ "normalized": false,
512
+ "rstrip": true,
513
+ "single_word": false,
514
+ "special": true
515
+ },
516
+ "2122": {
517
+ "content": "<extra_id_38>",
518
+ "lstrip": true,
519
+ "normalized": false,
520
+ "rstrip": true,
521
+ "single_word": false,
522
+ "special": true
523
+ },
524
+ "2123": {
525
+ "content": "<extra_id_37>",
526
+ "lstrip": true,
527
+ "normalized": false,
528
+ "rstrip": true,
529
+ "single_word": false,
530
+ "special": true
531
+ },
532
+ "2124": {
533
+ "content": "<extra_id_36>",
534
+ "lstrip": true,
535
+ "normalized": false,
536
+ "rstrip": true,
537
+ "single_word": false,
538
+ "special": true
539
+ },
540
+ "2125": {
541
+ "content": "<extra_id_35>",
542
+ "lstrip": true,
543
+ "normalized": false,
544
+ "rstrip": true,
545
+ "single_word": false,
546
+ "special": true
547
+ },
548
+ "2126": {
549
+ "content": "<extra_id_34>",
550
+ "lstrip": true,
551
+ "normalized": false,
552
+ "rstrip": true,
553
+ "single_word": false,
554
+ "special": true
555
+ },
556
+ "2127": {
557
+ "content": "<extra_id_33>",
558
+ "lstrip": true,
559
+ "normalized": false,
560
+ "rstrip": true,
561
+ "single_word": false,
562
+ "special": true
563
+ },
564
+ "2128": {
565
+ "content": "<extra_id_32>",
566
+ "lstrip": true,
567
+ "normalized": false,
568
+ "rstrip": true,
569
+ "single_word": false,
570
+ "special": true
571
+ },
572
+ "2129": {
573
+ "content": "<extra_id_31>",
574
+ "lstrip": true,
575
+ "normalized": false,
576
+ "rstrip": true,
577
+ "single_word": false,
578
+ "special": true
579
+ },
580
+ "2130": {
581
+ "content": "<extra_id_30>",
582
+ "lstrip": true,
583
+ "normalized": false,
584
+ "rstrip": true,
585
+ "single_word": false,
586
+ "special": true
587
+ },
588
+ "2131": {
589
+ "content": "<extra_id_29>",
590
+ "lstrip": true,
591
+ "normalized": false,
592
+ "rstrip": true,
593
+ "single_word": false,
594
+ "special": true
595
+ },
596
+ "2132": {
597
+ "content": "<extra_id_28>",
598
+ "lstrip": true,
599
+ "normalized": false,
600
+ "rstrip": true,
601
+ "single_word": false,
602
+ "special": true
603
+ },
604
+ "2133": {
605
+ "content": "<extra_id_27>",
606
+ "lstrip": true,
607
+ "normalized": false,
608
+ "rstrip": true,
609
+ "single_word": false,
610
+ "special": true
611
+ },
612
+ "2134": {
613
+ "content": "<extra_id_26>",
614
+ "lstrip": true,
615
+ "normalized": false,
616
+ "rstrip": true,
617
+ "single_word": false,
618
+ "special": true
619
+ },
620
+ "2135": {
621
+ "content": "<extra_id_25>",
622
+ "lstrip": true,
623
+ "normalized": false,
624
+ "rstrip": true,
625
+ "single_word": false,
626
+ "special": true
627
+ },
628
+ "2136": {
629
+ "content": "<extra_id_24>",
630
+ "lstrip": true,
631
+ "normalized": false,
632
+ "rstrip": true,
633
+ "single_word": false,
634
+ "special": true
635
+ },
636
+ "2137": {
637
+ "content": "<extra_id_23>",
638
+ "lstrip": true,
639
+ "normalized": false,
640
+ "rstrip": true,
641
+ "single_word": false,
642
+ "special": true
643
+ },
644
+ "2138": {
645
+ "content": "<extra_id_22>",
646
+ "lstrip": true,
647
+ "normalized": false,
648
+ "rstrip": true,
649
+ "single_word": false,
650
+ "special": true
651
+ },
652
+ "2139": {
653
+ "content": "<extra_id_21>",
654
+ "lstrip": true,
655
+ "normalized": false,
656
+ "rstrip": true,
657
+ "single_word": false,
658
+ "special": true
659
+ },
660
+ "2140": {
661
+ "content": "<extra_id_20>",
662
+ "lstrip": true,
663
+ "normalized": false,
664
+ "rstrip": true,
665
+ "single_word": false,
666
+ "special": true
667
+ },
668
+ "2141": {
669
+ "content": "<extra_id_19>",
670
+ "lstrip": true,
671
+ "normalized": false,
672
+ "rstrip": true,
673
+ "single_word": false,
674
+ "special": true
675
+ },
676
+ "2142": {
677
+ "content": "<extra_id_18>",
678
+ "lstrip": true,
679
+ "normalized": false,
680
+ "rstrip": true,
681
+ "single_word": false,
682
+ "special": true
683
+ },
684
+ "2143": {
685
+ "content": "<extra_id_17>",
686
+ "lstrip": true,
687
+ "normalized": false,
688
+ "rstrip": true,
689
+ "single_word": false,
690
+ "special": true
691
+ },
692
+ "2144": {
693
+ "content": "<extra_id_16>",
694
+ "lstrip": true,
695
+ "normalized": false,
696
+ "rstrip": true,
697
+ "single_word": false,
698
+ "special": true
699
+ },
700
+ "2145": {
701
+ "content": "<extra_id_15>",
702
+ "lstrip": true,
703
+ "normalized": false,
704
+ "rstrip": true,
705
+ "single_word": false,
706
+ "special": true
707
+ },
708
+ "2146": {
709
+ "content": "<extra_id_14>",
710
+ "lstrip": true,
711
+ "normalized": false,
712
+ "rstrip": true,
713
+ "single_word": false,
714
+ "special": true
715
+ },
716
+ "2147": {
717
+ "content": "<extra_id_13>",
718
+ "lstrip": true,
719
+ "normalized": false,
720
+ "rstrip": true,
721
+ "single_word": false,
722
+ "special": true
723
+ },
724
+ "2148": {
725
+ "content": "<extra_id_12>",
726
+ "lstrip": true,
727
+ "normalized": false,
728
+ "rstrip": true,
729
+ "single_word": false,
730
+ "special": true
731
+ },
732
+ "2149": {
733
+ "content": "<extra_id_11>",
734
+ "lstrip": true,
735
+ "normalized": false,
736
+ "rstrip": true,
737
+ "single_word": false,
738
+ "special": true
739
+ },
740
+ "2150": {
741
+ "content": "<extra_id_10>",
742
+ "lstrip": true,
743
+ "normalized": false,
744
+ "rstrip": true,
745
+ "single_word": false,
746
+ "special": true
747
+ },
748
+ "2151": {
749
+ "content": "<extra_id_9>",
750
+ "lstrip": true,
751
+ "normalized": false,
752
+ "rstrip": true,
753
+ "single_word": false,
754
+ "special": true
755
+ },
756
+ "2152": {
757
+ "content": "<extra_id_8>",
758
+ "lstrip": true,
759
+ "normalized": false,
760
+ "rstrip": true,
761
+ "single_word": false,
762
+ "special": true
763
+ },
764
+ "2153": {
765
+ "content": "<extra_id_7>",
766
+ "lstrip": true,
767
+ "normalized": false,
768
+ "rstrip": true,
769
+ "single_word": false,
770
+ "special": true
771
+ },
772
+ "2154": {
773
+ "content": "<extra_id_6>",
774
+ "lstrip": true,
775
+ "normalized": false,
776
+ "rstrip": true,
777
+ "single_word": false,
778
+ "special": true
779
+ },
780
+ "2155": {
781
+ "content": "<extra_id_5>",
782
+ "lstrip": true,
783
+ "normalized": false,
784
+ "rstrip": true,
785
+ "single_word": false,
786
+ "special": true
787
+ },
788
+ "2156": {
789
+ "content": "<extra_id_4>",
790
+ "lstrip": true,
791
+ "normalized": false,
792
+ "rstrip": true,
793
+ "single_word": false,
794
+ "special": true
795
+ },
796
+ "2157": {
797
+ "content": "<extra_id_3>",
798
+ "lstrip": true,
799
+ "normalized": false,
800
+ "rstrip": true,
801
+ "single_word": false,
802
+ "special": true
803
+ },
804
+ "2158": {
805
+ "content": "<extra_id_2>",
806
+ "lstrip": true,
807
+ "normalized": false,
808
+ "rstrip": true,
809
+ "single_word": false,
810
+ "special": true
811
+ },
812
+ "2159": {
813
+ "content": "<extra_id_1>",
814
+ "lstrip": true,
815
+ "normalized": false,
816
+ "rstrip": true,
817
+ "single_word": false,
818
+ "special": true
819
+ },
820
+ "2160": {
821
+ "content": "<extra_id_0>",
822
+ "lstrip": true,
823
+ "normalized": false,
824
+ "rstrip": true,
825
+ "single_word": false,
826
+ "special": true
827
+ }
828
+ },
829
+ "additional_special_tokens": [
830
+ "<extra_id_0>",
831
+ "<extra_id_1>",
832
+ "<extra_id_2>",
833
+ "<extra_id_3>",
834
+ "<extra_id_4>",
835
+ "<extra_id_5>",
836
+ "<extra_id_6>",
837
+ "<extra_id_7>",
838
+ "<extra_id_8>",
839
+ "<extra_id_9>",
840
+ "<extra_id_10>",
841
+ "<extra_id_11>",
842
+ "<extra_id_12>",
843
+ "<extra_id_13>",
844
+ "<extra_id_14>",
845
+ "<extra_id_15>",
846
+ "<extra_id_16>",
847
+ "<extra_id_17>",
848
+ "<extra_id_18>",
849
+ "<extra_id_19>",
850
+ "<extra_id_20>",
851
+ "<extra_id_21>",
852
+ "<extra_id_22>",
853
+ "<extra_id_23>",
854
+ "<extra_id_24>",
855
+ "<extra_id_25>",
856
+ "<extra_id_26>",
857
+ "<extra_id_27>",
858
+ "<extra_id_28>",
859
+ "<extra_id_29>",
860
+ "<extra_id_30>",
861
+ "<extra_id_31>",
862
+ "<extra_id_32>",
863
+ "<extra_id_33>",
864
+ "<extra_id_34>",
865
+ "<extra_id_35>",
866
+ "<extra_id_36>",
867
+ "<extra_id_37>",
868
+ "<extra_id_38>",
869
+ "<extra_id_39>",
870
+ "<extra_id_40>",
871
+ "<extra_id_41>",
872
+ "<extra_id_42>",
873
+ "<extra_id_43>",
874
+ "<extra_id_44>",
875
+ "<extra_id_45>",
876
+ "<extra_id_46>",
877
+ "<extra_id_47>",
878
+ "<extra_id_48>",
879
+ "<extra_id_49>",
880
+ "<extra_id_50>",
881
+ "<extra_id_51>",
882
+ "<extra_id_52>",
883
+ "<extra_id_53>",
884
+ "<extra_id_54>",
885
+ "<extra_id_55>",
886
+ "<extra_id_56>",
887
+ "<extra_id_57>",
888
+ "<extra_id_58>",
889
+ "<extra_id_59>",
890
+ "<extra_id_60>",
891
+ "<extra_id_61>",
892
+ "<extra_id_62>",
893
+ "<extra_id_63>",
894
+ "<extra_id_64>",
895
+ "<extra_id_65>",
896
+ "<extra_id_66>",
897
+ "<extra_id_67>",
898
+ "<extra_id_68>",
899
+ "<extra_id_69>",
900
+ "<extra_id_70>",
901
+ "<extra_id_71>",
902
+ "<extra_id_72>",
903
+ "<extra_id_73>",
904
+ "<extra_id_74>",
905
+ "<extra_id_75>",
906
+ "<extra_id_76>",
907
+ "<extra_id_77>",
908
+ "<extra_id_78>",
909
+ "<extra_id_79>",
910
+ "<extra_id_80>",
911
+ "<extra_id_81>",
912
+ "<extra_id_82>",
913
+ "<extra_id_83>",
914
+ "<extra_id_84>",
915
+ "<extra_id_85>",
916
+ "<extra_id_86>",
917
+ "<extra_id_87>",
918
+ "<extra_id_88>",
919
+ "<extra_id_89>",
920
+ "<extra_id_90>",
921
+ "<extra_id_91>",
922
+ "<extra_id_92>",
923
+ "<extra_id_93>",
924
+ "<extra_id_94>",
925
+ "<extra_id_95>",
926
+ "<extra_id_96>",
927
+ "<extra_id_97>",
928
+ "<extra_id_98>",
929
+ "<extra_id_99>"
930
+ ],
931
+ "clean_up_tokenization_spaces": true,
932
+ "eos_token": "</s>",
933
+ "extra_ids": 100,
934
+ "legacy": true,
935
+ "model_max_length": 512,
936
+ "pad_token": "<pad>",
937
+ "sp_model_kwargs": {},
938
+ "tokenizer_class": "T5Tokenizer",
939
+ "unk_token": "<unk>"
940
+ }
trainer_state.json ADDED
@@ -0,0 +1,1125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.13344644508202191,
5
+ "eval_steps": 1000,
6
+ "global_step": 14000,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.0009531888934430136,
13
+ "grad_norm": 1.0701079368591309,
14
+ "learning_rate": 1.5e-06,
15
+ "loss": 0.2617,
16
+ "step": 100
17
+ },
18
+ {
19
+ "epoch": 0.0019063777868860272,
20
+ "grad_norm": 1.0461440086364746,
21
+ "learning_rate": 3e-06,
22
+ "loss": 0.2595,
23
+ "step": 200
24
+ },
25
+ {
26
+ "epoch": 0.002859566680329041,
27
+ "grad_norm": 1.0249755382537842,
28
+ "learning_rate": 4.5e-06,
29
+ "loss": 0.2595,
30
+ "step": 300
31
+ },
32
+ {
33
+ "epoch": 0.0038127555737720543,
34
+ "grad_norm": 0.9327605366706848,
35
+ "learning_rate": 6e-06,
36
+ "loss": 0.2563,
37
+ "step": 400
38
+ },
39
+ {
40
+ "epoch": 0.004765944467215068,
41
+ "grad_norm": 0.9439413547515869,
42
+ "learning_rate": 7.5e-06,
43
+ "loss": 0.2589,
44
+ "step": 500
45
+ },
46
+ {
47
+ "epoch": 0.005719133360658082,
48
+ "grad_norm": 0.8729381561279297,
49
+ "learning_rate": 9e-06,
50
+ "loss": 0.2617,
51
+ "step": 600
52
+ },
53
+ {
54
+ "epoch": 0.006672322254101095,
55
+ "grad_norm": 0.9562346935272217,
56
+ "learning_rate": 1.05e-05,
57
+ "loss": 0.259,
58
+ "step": 700
59
+ },
60
+ {
61
+ "epoch": 0.007625511147544109,
62
+ "grad_norm": 1.7502244710922241,
63
+ "learning_rate": 1.2e-05,
64
+ "loss": 0.2551,
65
+ "step": 800
66
+ },
67
+ {
68
+ "epoch": 0.008578700040987123,
69
+ "grad_norm": 0.8447253704071045,
70
+ "learning_rate": 1.3500000000000001e-05,
71
+ "loss": 0.2555,
72
+ "step": 900
73
+ },
74
+ {
75
+ "epoch": 0.009531888934430136,
76
+ "grad_norm": 0.9096837043762207,
77
+ "learning_rate": 1.5e-05,
78
+ "loss": 0.2637,
79
+ "step": 1000
80
+ },
81
+ {
82
+ "epoch": 0.009531888934430136,
83
+ "eval_loss": 0.22202371060848236,
84
+ "eval_runtime": 24.6656,
85
+ "eval_samples_per_second": 608.134,
86
+ "eval_steps_per_second": 9.527,
87
+ "step": 1000
88
+ },
89
+ {
90
+ "epoch": 0.01048507782787315,
91
+ "grad_norm": 0.9705513715744019,
92
+ "learning_rate": 1.65e-05,
93
+ "loss": 0.2614,
94
+ "step": 1100
95
+ },
96
+ {
97
+ "epoch": 0.011438266721316164,
98
+ "grad_norm": 0.9748035669326782,
99
+ "learning_rate": 1.8e-05,
100
+ "loss": 0.2648,
101
+ "step": 1200
102
+ },
103
+ {
104
+ "epoch": 0.012391455614759177,
105
+ "grad_norm": 2.0027875900268555,
106
+ "learning_rate": 1.95e-05,
107
+ "loss": 0.2605,
108
+ "step": 1300
109
+ },
110
+ {
111
+ "epoch": 0.01334464450820219,
112
+ "grad_norm": 1.203764796257019,
113
+ "learning_rate": 2.1e-05,
114
+ "loss": 0.2645,
115
+ "step": 1400
116
+ },
117
+ {
118
+ "epoch": 0.014297833401645204,
119
+ "grad_norm": 1.2857439517974854,
120
+ "learning_rate": 2.25e-05,
121
+ "loss": 0.2622,
122
+ "step": 1500
123
+ },
124
+ {
125
+ "epoch": 0.015251022295088217,
126
+ "grad_norm": 0.969646692276001,
127
+ "learning_rate": 2.4e-05,
128
+ "loss": 0.2604,
129
+ "step": 1600
130
+ },
131
+ {
132
+ "epoch": 0.016204211188531232,
133
+ "grad_norm": 0.8485471606254578,
134
+ "learning_rate": 2.55e-05,
135
+ "loss": 0.2628,
136
+ "step": 1700
137
+ },
138
+ {
139
+ "epoch": 0.017157400081974247,
140
+ "grad_norm": 1.1885377168655396,
141
+ "learning_rate": 2.7000000000000002e-05,
142
+ "loss": 0.2665,
143
+ "step": 1800
144
+ },
145
+ {
146
+ "epoch": 0.018110588975417258,
147
+ "grad_norm": 1.98976469039917,
148
+ "learning_rate": 2.8499999999999998e-05,
149
+ "loss": 0.2723,
150
+ "step": 1900
151
+ },
152
+ {
153
+ "epoch": 0.019063777868860272,
154
+ "grad_norm": 1.0017362833023071,
155
+ "learning_rate": 3e-05,
156
+ "loss": 0.2645,
157
+ "step": 2000
158
+ },
159
+ {
160
+ "epoch": 0.019063777868860272,
161
+ "eval_loss": 0.2264958769083023,
162
+ "eval_runtime": 24.3618,
163
+ "eval_samples_per_second": 615.718,
164
+ "eval_steps_per_second": 9.646,
165
+ "step": 2000
166
+ },
167
+ {
168
+ "epoch": 0.020016966762303287,
169
+ "grad_norm": 1.3095935583114624,
170
+ "learning_rate": 2.9970848597331675e-05,
171
+ "loss": 0.2735,
172
+ "step": 2100
173
+ },
174
+ {
175
+ "epoch": 0.0209701556557463,
176
+ "grad_norm": 1.0084208250045776,
177
+ "learning_rate": 2.9941697194663354e-05,
178
+ "loss": 0.2697,
179
+ "step": 2200
180
+ },
181
+ {
182
+ "epoch": 0.021923344549189313,
183
+ "grad_norm": 0.9595718383789062,
184
+ "learning_rate": 2.9912545791995025e-05,
185
+ "loss": 0.2706,
186
+ "step": 2300
187
+ },
188
+ {
189
+ "epoch": 0.022876533442632328,
190
+ "grad_norm": 1.156947374343872,
191
+ "learning_rate": 2.98833943893267e-05,
192
+ "loss": 0.2664,
193
+ "step": 2400
194
+ },
195
+ {
196
+ "epoch": 0.02382972233607534,
197
+ "grad_norm": 0.9906996488571167,
198
+ "learning_rate": 2.9854242986658374e-05,
199
+ "loss": 0.267,
200
+ "step": 2500
201
+ },
202
+ {
203
+ "epoch": 0.024782911229518353,
204
+ "grad_norm": 1.133239507675171,
205
+ "learning_rate": 2.9825091583990053e-05,
206
+ "loss": 0.2697,
207
+ "step": 2600
208
+ },
209
+ {
210
+ "epoch": 0.025736100122961368,
211
+ "grad_norm": 1.1839542388916016,
212
+ "learning_rate": 2.9795940181321727e-05,
213
+ "loss": 0.2644,
214
+ "step": 2700
215
+ },
216
+ {
217
+ "epoch": 0.02668928901640438,
218
+ "grad_norm": 1.1177607774734497,
219
+ "learning_rate": 2.97667887786534e-05,
220
+ "loss": 0.2649,
221
+ "step": 2800
222
+ },
223
+ {
224
+ "epoch": 0.027642477909847394,
225
+ "grad_norm": 1.0634980201721191,
226
+ "learning_rate": 2.9737637375985073e-05,
227
+ "loss": 0.273,
228
+ "step": 2900
229
+ },
230
+ {
231
+ "epoch": 0.02859566680329041,
232
+ "grad_norm": 1.141790747642517,
233
+ "learning_rate": 2.970848597331675e-05,
234
+ "loss": 0.2717,
235
+ "step": 3000
236
+ },
237
+ {
238
+ "epoch": 0.02859566680329041,
239
+ "eval_loss": 0.23009072244167328,
240
+ "eval_runtime": 25.5443,
241
+ "eval_samples_per_second": 587.214,
242
+ "eval_steps_per_second": 9.2,
243
+ "step": 3000
244
+ },
245
+ {
246
+ "epoch": 0.029548855696733423,
247
+ "grad_norm": 0.8992202281951904,
248
+ "learning_rate": 2.9679334570648426e-05,
249
+ "loss": 0.272,
250
+ "step": 3100
251
+ },
252
+ {
253
+ "epoch": 0.030502044590176434,
254
+ "grad_norm": 1.1783612966537476,
255
+ "learning_rate": 2.96501831679801e-05,
256
+ "loss": 0.2705,
257
+ "step": 3200
258
+ },
259
+ {
260
+ "epoch": 0.03145523348361945,
261
+ "grad_norm": 1.516988754272461,
262
+ "learning_rate": 2.9621031765311772e-05,
263
+ "loss": 0.2696,
264
+ "step": 3300
265
+ },
266
+ {
267
+ "epoch": 0.032408422377062464,
268
+ "grad_norm": 0.9750285148620605,
269
+ "learning_rate": 2.959188036264345e-05,
270
+ "loss": 0.2661,
271
+ "step": 3400
272
+ },
273
+ {
274
+ "epoch": 0.03336161127050548,
275
+ "grad_norm": 1.0874147415161133,
276
+ "learning_rate": 2.9562728959975125e-05,
277
+ "loss": 0.2713,
278
+ "step": 3500
279
+ },
280
+ {
281
+ "epoch": 0.03431480016394849,
282
+ "grad_norm": 1.2503632307052612,
283
+ "learning_rate": 2.95335775573068e-05,
284
+ "loss": 0.2694,
285
+ "step": 3600
286
+ },
287
+ {
288
+ "epoch": 0.0352679890573915,
289
+ "grad_norm": 2.1983683109283447,
290
+ "learning_rate": 2.9504426154638478e-05,
291
+ "loss": 0.2715,
292
+ "step": 3700
293
+ },
294
+ {
295
+ "epoch": 0.036221177950834516,
296
+ "grad_norm": 1.0884830951690674,
297
+ "learning_rate": 2.947527475197015e-05,
298
+ "loss": 0.2671,
299
+ "step": 3800
300
+ },
301
+ {
302
+ "epoch": 0.03717436684427753,
303
+ "grad_norm": 0.9805251955986023,
304
+ "learning_rate": 2.9446123349301824e-05,
305
+ "loss": 0.2705,
306
+ "step": 3900
307
+ },
308
+ {
309
+ "epoch": 0.038127555737720545,
310
+ "grad_norm": 1.0471646785736084,
311
+ "learning_rate": 2.94169719466335e-05,
312
+ "loss": 0.2657,
313
+ "step": 4000
314
+ },
315
+ {
316
+ "epoch": 0.038127555737720545,
317
+ "eval_loss": 0.22619383037090302,
318
+ "eval_runtime": 24.3251,
319
+ "eval_samples_per_second": 616.647,
320
+ "eval_steps_per_second": 9.661,
321
+ "step": 4000
322
+ },
323
+ {
324
+ "epoch": 0.03908074463116356,
325
+ "grad_norm": 1.080304503440857,
326
+ "learning_rate": 2.9387820543965177e-05,
327
+ "loss": 0.2755,
328
+ "step": 4100
329
+ },
330
+ {
331
+ "epoch": 0.040033933524606574,
332
+ "grad_norm": 1.2072677612304688,
333
+ "learning_rate": 2.935866914129685e-05,
334
+ "loss": 0.2666,
335
+ "step": 4200
336
+ },
337
+ {
338
+ "epoch": 0.04098712241804958,
339
+ "grad_norm": 1.1678977012634277,
340
+ "learning_rate": 2.9329517738628523e-05,
341
+ "loss": 0.2708,
342
+ "step": 4300
343
+ },
344
+ {
345
+ "epoch": 0.0419403113114926,
346
+ "grad_norm": 0.9155502319335938,
347
+ "learning_rate": 2.9300366335960198e-05,
348
+ "loss": 0.2701,
349
+ "step": 4400
350
+ },
351
+ {
352
+ "epoch": 0.04289350020493561,
353
+ "grad_norm": 1.022687315940857,
354
+ "learning_rate": 2.9271214933291876e-05,
355
+ "loss": 0.276,
356
+ "step": 4500
357
+ },
358
+ {
359
+ "epoch": 0.043846689098378626,
360
+ "grad_norm": 1.0507577657699585,
361
+ "learning_rate": 2.924206353062355e-05,
362
+ "loss": 0.2695,
363
+ "step": 4600
364
+ },
365
+ {
366
+ "epoch": 0.04479987799182164,
367
+ "grad_norm": 0.9346485137939453,
368
+ "learning_rate": 2.9212912127955225e-05,
369
+ "loss": 0.2715,
370
+ "step": 4700
371
+ },
372
+ {
373
+ "epoch": 0.045753066885264655,
374
+ "grad_norm": 1.0042835474014282,
375
+ "learning_rate": 2.9183760725286897e-05,
376
+ "loss": 0.2671,
377
+ "step": 4800
378
+ },
379
+ {
380
+ "epoch": 0.04670625577870767,
381
+ "grad_norm": 1.106454610824585,
382
+ "learning_rate": 2.9154609322618575e-05,
383
+ "loss": 0.2666,
384
+ "step": 4900
385
+ },
386
+ {
387
+ "epoch": 0.04765944467215068,
388
+ "grad_norm": 0.911589503288269,
389
+ "learning_rate": 2.912545791995025e-05,
390
+ "loss": 0.264,
391
+ "step": 5000
392
+ },
393
+ {
394
+ "epoch": 0.04765944467215068,
395
+ "eval_loss": 0.22571362555027008,
396
+ "eval_runtime": 24.0986,
397
+ "eval_samples_per_second": 622.442,
398
+ "eval_steps_per_second": 9.752,
399
+ "step": 5000
400
+ },
401
+ {
402
+ "epoch": 0.04861263356559369,
403
+ "grad_norm": 0.8723756670951843,
404
+ "learning_rate": 2.9096306517281924e-05,
405
+ "loss": 0.264,
406
+ "step": 5100
407
+ },
408
+ {
409
+ "epoch": 0.04956582245903671,
410
+ "grad_norm": 1.034590482711792,
411
+ "learning_rate": 2.90671551146136e-05,
412
+ "loss": 0.2767,
413
+ "step": 5200
414
+ },
415
+ {
416
+ "epoch": 0.05051901135247972,
417
+ "grad_norm": 1.0665106773376465,
418
+ "learning_rate": 2.9038003711945274e-05,
419
+ "loss": 0.2676,
420
+ "step": 5300
421
+ },
422
+ {
423
+ "epoch": 0.051472200245922736,
424
+ "grad_norm": 0.9242556095123291,
425
+ "learning_rate": 2.900885230927695e-05,
426
+ "loss": 0.2699,
427
+ "step": 5400
428
+ },
429
+ {
430
+ "epoch": 0.05242538913936575,
431
+ "grad_norm": 1.1992926597595215,
432
+ "learning_rate": 2.8979700906608623e-05,
433
+ "loss": 0.2682,
434
+ "step": 5500
435
+ },
436
+ {
437
+ "epoch": 0.05337857803280876,
438
+ "grad_norm": 0.9543828964233398,
439
+ "learning_rate": 2.89505495039403e-05,
440
+ "loss": 0.2713,
441
+ "step": 5600
442
+ },
443
+ {
444
+ "epoch": 0.05433176692625177,
445
+ "grad_norm": 0.9702574014663696,
446
+ "learning_rate": 2.8921398101271973e-05,
447
+ "loss": 0.2663,
448
+ "step": 5700
449
+ },
450
+ {
451
+ "epoch": 0.05528495581969479,
452
+ "grad_norm": 0.9306678175926208,
453
+ "learning_rate": 2.8892246698603647e-05,
454
+ "loss": 0.2712,
455
+ "step": 5800
456
+ },
457
+ {
458
+ "epoch": 0.0562381447131378,
459
+ "grad_norm": 1.2940869331359863,
460
+ "learning_rate": 2.8863095295935322e-05,
461
+ "loss": 0.2732,
462
+ "step": 5900
463
+ },
464
+ {
465
+ "epoch": 0.05719133360658082,
466
+ "grad_norm": 0.8944372534751892,
467
+ "learning_rate": 2.8833943893267e-05,
468
+ "loss": 0.2675,
469
+ "step": 6000
470
+ },
471
+ {
472
+ "epoch": 0.05719133360658082,
473
+ "eval_loss": 0.22631041705608368,
474
+ "eval_runtime": 24.2322,
475
+ "eval_samples_per_second": 619.011,
476
+ "eval_steps_per_second": 9.698,
477
+ "step": 6000
478
+ },
479
+ {
480
+ "epoch": 0.05814452250002383,
481
+ "grad_norm": 1.1152732372283936,
482
+ "learning_rate": 2.8804792490598675e-05,
483
+ "loss": 0.2632,
484
+ "step": 6100
485
+ },
486
+ {
487
+ "epoch": 0.05909771139346685,
488
+ "grad_norm": 0.90058833360672,
489
+ "learning_rate": 2.8775641087930346e-05,
490
+ "loss": 0.2677,
491
+ "step": 6200
492
+ },
493
+ {
494
+ "epoch": 0.060050900286909854,
495
+ "grad_norm": 0.9290627241134644,
496
+ "learning_rate": 2.874648968526202e-05,
497
+ "loss": 0.2667,
498
+ "step": 6300
499
+ },
500
+ {
501
+ "epoch": 0.06100408918035287,
502
+ "grad_norm": 1.0167937278747559,
503
+ "learning_rate": 2.87173382825937e-05,
504
+ "loss": 0.2658,
505
+ "step": 6400
506
+ },
507
+ {
508
+ "epoch": 0.061957278073795884,
509
+ "grad_norm": 1.0440782308578491,
510
+ "learning_rate": 2.8688186879925374e-05,
511
+ "loss": 0.2672,
512
+ "step": 6500
513
+ },
514
+ {
515
+ "epoch": 0.0629104669672389,
516
+ "grad_norm": 1.0155839920043945,
517
+ "learning_rate": 2.865903547725705e-05,
518
+ "loss": 0.2657,
519
+ "step": 6600
520
+ },
521
+ {
522
+ "epoch": 0.0638636558606819,
523
+ "grad_norm": 0.879859209060669,
524
+ "learning_rate": 2.862988407458872e-05,
525
+ "loss": 0.2674,
526
+ "step": 6700
527
+ },
528
+ {
529
+ "epoch": 0.06481684475412493,
530
+ "grad_norm": 0.9081212878227234,
531
+ "learning_rate": 2.8600732671920398e-05,
532
+ "loss": 0.2644,
533
+ "step": 6800
534
+ },
535
+ {
536
+ "epoch": 0.06577003364756794,
537
+ "grad_norm": 1.1635853052139282,
538
+ "learning_rate": 2.8571581269252073e-05,
539
+ "loss": 0.2609,
540
+ "step": 6900
541
+ },
542
+ {
543
+ "epoch": 0.06672322254101096,
544
+ "grad_norm": 1.0756968259811401,
545
+ "learning_rate": 2.8542429866583747e-05,
546
+ "loss": 0.2682,
547
+ "step": 7000
548
+ },
549
+ {
550
+ "epoch": 0.06672322254101096,
551
+ "eval_loss": 0.22241491079330444,
552
+ "eval_runtime": 25.327,
553
+ "eval_samples_per_second": 592.253,
554
+ "eval_steps_per_second": 9.279,
555
+ "step": 7000
556
+ },
557
+ {
558
+ "epoch": 0.06767641143445396,
559
+ "grad_norm": 1.0364997386932373,
560
+ "learning_rate": 2.8513278463915425e-05,
561
+ "loss": 0.2651,
562
+ "step": 7100
563
+ },
564
+ {
565
+ "epoch": 0.06862960032789699,
566
+ "grad_norm": 1.0817292928695679,
567
+ "learning_rate": 2.8484127061247097e-05,
568
+ "loss": 0.2634,
569
+ "step": 7200
570
+ },
571
+ {
572
+ "epoch": 0.06958278922134,
573
+ "grad_norm": 1.052465796470642,
574
+ "learning_rate": 2.845497565857877e-05,
575
+ "loss": 0.2672,
576
+ "step": 7300
577
+ },
578
+ {
579
+ "epoch": 0.070535978114783,
580
+ "grad_norm": 0.8442723155021667,
581
+ "learning_rate": 2.8425824255910446e-05,
582
+ "loss": 0.2709,
583
+ "step": 7400
584
+ },
585
+ {
586
+ "epoch": 0.07148916700822602,
587
+ "grad_norm": 1.104926347732544,
588
+ "learning_rate": 2.8396672853242124e-05,
589
+ "loss": 0.2617,
590
+ "step": 7500
591
+ },
592
+ {
593
+ "epoch": 0.07244235590166903,
594
+ "grad_norm": 1.0135023593902588,
595
+ "learning_rate": 2.83675214505738e-05,
596
+ "loss": 0.2625,
597
+ "step": 7600
598
+ },
599
+ {
600
+ "epoch": 0.07339554479511205,
601
+ "grad_norm": 0.9307543039321899,
602
+ "learning_rate": 2.833837004790547e-05,
603
+ "loss": 0.2671,
604
+ "step": 7700
605
+ },
606
+ {
607
+ "epoch": 0.07434873368855506,
608
+ "grad_norm": 1.5013054609298706,
609
+ "learning_rate": 2.8309218645237145e-05,
610
+ "loss": 0.2656,
611
+ "step": 7800
612
+ },
613
+ {
614
+ "epoch": 0.07530192258199807,
615
+ "grad_norm": 0.923324465751648,
616
+ "learning_rate": 2.8280067242568823e-05,
617
+ "loss": 0.2607,
618
+ "step": 7900
619
+ },
620
+ {
621
+ "epoch": 0.07625511147544109,
622
+ "grad_norm": 1.065769076347351,
623
+ "learning_rate": 2.8250915839900498e-05,
624
+ "loss": 0.2641,
625
+ "step": 8000
626
+ },
627
+ {
628
+ "epoch": 0.07625511147544109,
629
+ "eval_loss": 0.22064544260501862,
630
+ "eval_runtime": 25.6245,
631
+ "eval_samples_per_second": 585.378,
632
+ "eval_steps_per_second": 9.171,
633
+ "step": 8000
634
+ },
635
+ {
636
+ "epoch": 0.0772083003688841,
637
+ "grad_norm": 1.053281545639038,
638
+ "learning_rate": 2.8221764437232173e-05,
639
+ "loss": 0.2633,
640
+ "step": 8100
641
+ },
642
+ {
643
+ "epoch": 0.07816148926232712,
644
+ "grad_norm": 1.0560704469680786,
645
+ "learning_rate": 2.8192613034563844e-05,
646
+ "loss": 0.2602,
647
+ "step": 8200
648
+ },
649
+ {
650
+ "epoch": 0.07911467815577013,
651
+ "grad_norm": 1.0632127523422241,
652
+ "learning_rate": 2.8163461631895522e-05,
653
+ "loss": 0.2647,
654
+ "step": 8300
655
+ },
656
+ {
657
+ "epoch": 0.08006786704921315,
658
+ "grad_norm": 1.0002626180648804,
659
+ "learning_rate": 2.8134310229227197e-05,
660
+ "loss": 0.2654,
661
+ "step": 8400
662
+ },
663
+ {
664
+ "epoch": 0.08102105594265616,
665
+ "grad_norm": 1.1899933815002441,
666
+ "learning_rate": 2.8105158826558872e-05,
667
+ "loss": 0.2631,
668
+ "step": 8500
669
+ },
670
+ {
671
+ "epoch": 0.08197424483609916,
672
+ "grad_norm": 0.9177943468093872,
673
+ "learning_rate": 2.807600742389055e-05,
674
+ "loss": 0.264,
675
+ "step": 8600
676
+ },
677
+ {
678
+ "epoch": 0.08292743372954219,
679
+ "grad_norm": 1.0969672203063965,
680
+ "learning_rate": 2.804685602122222e-05,
681
+ "loss": 0.2663,
682
+ "step": 8700
683
+ },
684
+ {
685
+ "epoch": 0.0838806226229852,
686
+ "grad_norm": 0.9465392231941223,
687
+ "learning_rate": 2.8017704618553896e-05,
688
+ "loss": 0.2599,
689
+ "step": 8800
690
+ },
691
+ {
692
+ "epoch": 0.08483381151642821,
693
+ "grad_norm": 1.1491124629974365,
694
+ "learning_rate": 2.798855321588557e-05,
695
+ "loss": 0.2616,
696
+ "step": 8900
697
+ },
698
+ {
699
+ "epoch": 0.08578700040987122,
700
+ "grad_norm": 1.040123701095581,
701
+ "learning_rate": 2.795940181321725e-05,
702
+ "loss": 0.2611,
703
+ "step": 9000
704
+ },
705
+ {
706
+ "epoch": 0.08578700040987122,
707
+ "eval_loss": 0.22252394258975983,
708
+ "eval_runtime": 24.4254,
709
+ "eval_samples_per_second": 614.114,
710
+ "eval_steps_per_second": 9.621,
711
+ "step": 9000
712
+ },
713
+ {
714
+ "epoch": 0.08674018930331424,
715
+ "grad_norm": 0.8041715621948242,
716
+ "learning_rate": 2.7930250410548923e-05,
717
+ "loss": 0.2597,
718
+ "step": 9100
719
+ },
720
+ {
721
+ "epoch": 0.08769337819675725,
722
+ "grad_norm": 1.2013587951660156,
723
+ "learning_rate": 2.7901099007880595e-05,
724
+ "loss": 0.2627,
725
+ "step": 9200
726
+ },
727
+ {
728
+ "epoch": 0.08864656709020026,
729
+ "grad_norm": 0.8449276089668274,
730
+ "learning_rate": 2.787194760521227e-05,
731
+ "loss": 0.2694,
732
+ "step": 9300
733
+ },
734
+ {
735
+ "epoch": 0.08959975598364328,
736
+ "grad_norm": 0.957938015460968,
737
+ "learning_rate": 2.7842796202543948e-05,
738
+ "loss": 0.2646,
739
+ "step": 9400
740
+ },
741
+ {
742
+ "epoch": 0.09055294487708629,
743
+ "grad_norm": 0.9442753195762634,
744
+ "learning_rate": 2.7813644799875622e-05,
745
+ "loss": 0.2618,
746
+ "step": 9500
747
+ },
748
+ {
749
+ "epoch": 0.09150613377052931,
750
+ "grad_norm": 1.0630254745483398,
751
+ "learning_rate": 2.7784493397207297e-05,
752
+ "loss": 0.267,
753
+ "step": 9600
754
+ },
755
+ {
756
+ "epoch": 0.09245932266397232,
757
+ "grad_norm": 0.9763880372047424,
758
+ "learning_rate": 2.775534199453897e-05,
759
+ "loss": 0.2631,
760
+ "step": 9700
761
+ },
762
+ {
763
+ "epoch": 0.09341251155741534,
764
+ "grad_norm": 1.059673547744751,
765
+ "learning_rate": 2.7726190591870647e-05,
766
+ "loss": 0.264,
767
+ "step": 9800
768
+ },
769
+ {
770
+ "epoch": 0.09436570045085835,
771
+ "grad_norm": 1.0772706270217896,
772
+ "learning_rate": 2.769703918920232e-05,
773
+ "loss": 0.26,
774
+ "step": 9900
775
+ },
776
+ {
777
+ "epoch": 0.09531888934430136,
778
+ "grad_norm": 0.9500916600227356,
779
+ "learning_rate": 2.7667887786533996e-05,
780
+ "loss": 0.2603,
781
+ "step": 10000
782
+ },
783
+ {
784
+ "epoch": 0.09531888934430136,
785
+ "eval_loss": 0.22107724845409393,
786
+ "eval_runtime": 24.1253,
787
+ "eval_samples_per_second": 621.753,
788
+ "eval_steps_per_second": 9.741,
789
+ "step": 10000
790
+ },
791
+ {
792
+ "epoch": 0.09627207823774438,
793
+ "grad_norm": 0.7942706346511841,
794
+ "learning_rate": 2.7639027897892354e-05,
795
+ "loss": 0.258,
796
+ "step": 10100
797
+ },
798
+ {
799
+ "epoch": 0.09722526713118738,
800
+ "grad_norm": 1.1196712255477905,
801
+ "learning_rate": 2.7610168009250712e-05,
802
+ "loss": 0.2594,
803
+ "step": 10200
804
+ },
805
+ {
806
+ "epoch": 0.0981784560246304,
807
+ "grad_norm": 0.9647284746170044,
808
+ "learning_rate": 2.7581016606582387e-05,
809
+ "loss": 0.2645,
810
+ "step": 10300
811
+ },
812
+ {
813
+ "epoch": 0.09913164491807341,
814
+ "grad_norm": 1.0983389616012573,
815
+ "learning_rate": 2.7551865203914065e-05,
816
+ "loss": 0.2589,
817
+ "step": 10400
818
+ },
819
+ {
820
+ "epoch": 0.10008483381151642,
821
+ "grad_norm": 0.8184943795204163,
822
+ "learning_rate": 2.7522713801245736e-05,
823
+ "loss": 0.2604,
824
+ "step": 10500
825
+ },
826
+ {
827
+ "epoch": 0.10103802270495944,
828
+ "grad_norm": 1.0684343576431274,
829
+ "learning_rate": 2.749356239857741e-05,
830
+ "loss": 0.2602,
831
+ "step": 10600
832
+ },
833
+ {
834
+ "epoch": 0.10199121159840245,
835
+ "grad_norm": 0.9852308034896851,
836
+ "learning_rate": 2.7464410995909086e-05,
837
+ "loss": 0.2688,
838
+ "step": 10700
839
+ },
840
+ {
841
+ "epoch": 0.10294440049184547,
842
+ "grad_norm": 0.8270373940467834,
843
+ "learning_rate": 2.7435259593240764e-05,
844
+ "loss": 0.2601,
845
+ "step": 10800
846
+ },
847
+ {
848
+ "epoch": 0.10389758938528848,
849
+ "grad_norm": 0.9181864857673645,
850
+ "learning_rate": 2.740610819057244e-05,
851
+ "loss": 0.259,
852
+ "step": 10900
853
+ },
854
+ {
855
+ "epoch": 0.1048507782787315,
856
+ "grad_norm": 0.8947911858558655,
857
+ "learning_rate": 2.737695678790411e-05,
858
+ "loss": 0.2616,
859
+ "step": 11000
860
+ },
861
+ {
862
+ "epoch": 0.1048507782787315,
863
+ "eval_loss": 0.22300027310848236,
864
+ "eval_runtime": 26.4519,
865
+ "eval_samples_per_second": 567.068,
866
+ "eval_steps_per_second": 8.884,
867
+ "step": 11000
868
+ },
869
+ {
870
+ "epoch": 0.10580396717217451,
871
+ "grad_norm": 1.19639253616333,
872
+ "learning_rate": 2.7347805385235785e-05,
873
+ "loss": 0.2624,
874
+ "step": 11100
875
+ },
876
+ {
877
+ "epoch": 0.10675715606561752,
878
+ "grad_norm": 1.3614460229873657,
879
+ "learning_rate": 2.7318653982567463e-05,
880
+ "loss": 0.2578,
881
+ "step": 11200
882
+ },
883
+ {
884
+ "epoch": 0.10771034495906054,
885
+ "grad_norm": 0.8842675089836121,
886
+ "learning_rate": 2.7289502579899138e-05,
887
+ "loss": 0.259,
888
+ "step": 11300
889
+ },
890
+ {
891
+ "epoch": 0.10866353385250355,
892
+ "grad_norm": 1.1543840169906616,
893
+ "learning_rate": 2.7260351177230812e-05,
894
+ "loss": 0.2594,
895
+ "step": 11400
896
+ },
897
+ {
898
+ "epoch": 0.10961672274594657,
899
+ "grad_norm": 1.1461540460586548,
900
+ "learning_rate": 2.7231199774562484e-05,
901
+ "loss": 0.2576,
902
+ "step": 11500
903
+ },
904
+ {
905
+ "epoch": 0.11056991163938958,
906
+ "grad_norm": 0.9683176279067993,
907
+ "learning_rate": 2.7202048371894162e-05,
908
+ "loss": 0.2597,
909
+ "step": 11600
910
+ },
911
+ {
912
+ "epoch": 0.1115231005328326,
913
+ "grad_norm": 1.1039471626281738,
914
+ "learning_rate": 2.7172896969225837e-05,
915
+ "loss": 0.2586,
916
+ "step": 11700
917
+ },
918
+ {
919
+ "epoch": 0.1124762894262756,
920
+ "grad_norm": 0.9412834644317627,
921
+ "learning_rate": 2.714374556655751e-05,
922
+ "loss": 0.2573,
923
+ "step": 11800
924
+ },
925
+ {
926
+ "epoch": 0.11342947831971861,
927
+ "grad_norm": 1.1193273067474365,
928
+ "learning_rate": 2.711459416388919e-05,
929
+ "loss": 0.2564,
930
+ "step": 11900
931
+ },
932
+ {
933
+ "epoch": 0.11438266721316163,
934
+ "grad_norm": 0.9070214033126831,
935
+ "learning_rate": 2.708544276122086e-05,
936
+ "loss": 0.2598,
937
+ "step": 12000
938
+ },
939
+ {
940
+ "epoch": 0.11438266721316163,
941
+ "eval_loss": 0.21802841126918793,
942
+ "eval_runtime": 24.3781,
943
+ "eval_samples_per_second": 615.305,
944
+ "eval_steps_per_second": 9.64,
945
+ "step": 12000
946
+ },
947
+ {
948
+ "epoch": 0.11533585610660464,
949
+ "grad_norm": 0.9957073330879211,
950
+ "learning_rate": 2.7056291358552536e-05,
951
+ "loss": 0.2582,
952
+ "step": 12100
953
+ },
954
+ {
955
+ "epoch": 0.11628904500004766,
956
+ "grad_norm": 0.9560794234275818,
957
+ "learning_rate": 2.702713995588421e-05,
958
+ "loss": 0.2624,
959
+ "step": 12200
960
+ },
961
+ {
962
+ "epoch": 0.11724223389349067,
963
+ "grad_norm": 1.0625020265579224,
964
+ "learning_rate": 2.699798855321589e-05,
965
+ "loss": 0.2606,
966
+ "step": 12300
967
+ },
968
+ {
969
+ "epoch": 0.1181954227869337,
970
+ "grad_norm": 1.2022795677185059,
971
+ "learning_rate": 2.6968837150547563e-05,
972
+ "loss": 0.2577,
973
+ "step": 12400
974
+ },
975
+ {
976
+ "epoch": 0.1191486116803767,
977
+ "grad_norm": 1.005925178527832,
978
+ "learning_rate": 2.6939685747879234e-05,
979
+ "loss": 0.2609,
980
+ "step": 12500
981
+ },
982
+ {
983
+ "epoch": 0.12010180057381971,
984
+ "grad_norm": 1.0519824028015137,
985
+ "learning_rate": 2.691053434521091e-05,
986
+ "loss": 0.2653,
987
+ "step": 12600
988
+ },
989
+ {
990
+ "epoch": 0.12105498946726273,
991
+ "grad_norm": 1.0782413482666016,
992
+ "learning_rate": 2.6881382942542587e-05,
993
+ "loss": 0.2537,
994
+ "step": 12700
995
+ },
996
+ {
997
+ "epoch": 0.12200817836070574,
998
+ "grad_norm": 0.9406309723854065,
999
+ "learning_rate": 2.6852231539874262e-05,
1000
+ "loss": 0.262,
1001
+ "step": 12800
1002
+ },
1003
+ {
1004
+ "epoch": 0.12296136725414876,
1005
+ "grad_norm": 0.922545850276947,
1006
+ "learning_rate": 2.682337165123262e-05,
1007
+ "loss": 0.2581,
1008
+ "step": 12900
1009
+ },
1010
+ {
1011
+ "epoch": 0.12391455614759177,
1012
+ "grad_norm": 0.8488488793373108,
1013
+ "learning_rate": 2.6794220248564295e-05,
1014
+ "loss": 0.2611,
1015
+ "step": 13000
1016
+ },
1017
+ {
1018
+ "epoch": 0.12391455614759177,
1019
+ "eval_loss": 0.22087305784225464,
1020
+ "eval_runtime": 23.9914,
1021
+ "eval_samples_per_second": 625.224,
1022
+ "eval_steps_per_second": 9.795,
1023
+ "step": 13000
1024
+ },
1025
+ {
1026
+ "epoch": 0.12486774504103478,
1027
+ "grad_norm": 0.9024129509925842,
1028
+ "learning_rate": 2.6765360359922653e-05,
1029
+ "loss": 0.2604,
1030
+ "step": 13100
1031
+ },
1032
+ {
1033
+ "epoch": 0.1258209339344778,
1034
+ "grad_norm": 0.9496759176254272,
1035
+ "learning_rate": 2.6736208957254328e-05,
1036
+ "loss": 0.2552,
1037
+ "step": 13200
1038
+ },
1039
+ {
1040
+ "epoch": 0.1267741228279208,
1041
+ "grad_norm": 1.0905983448028564,
1042
+ "learning_rate": 2.6707057554586002e-05,
1043
+ "loss": 0.2538,
1044
+ "step": 13300
1045
+ },
1046
+ {
1047
+ "epoch": 0.1277273117213638,
1048
+ "grad_norm": 1.1556366682052612,
1049
+ "learning_rate": 2.6677906151917677e-05,
1050
+ "loss": 0.2585,
1051
+ "step": 13400
1052
+ },
1053
+ {
1054
+ "epoch": 0.12868050061480685,
1055
+ "grad_norm": 1.0274028778076172,
1056
+ "learning_rate": 2.6648754749249352e-05,
1057
+ "loss": 0.2546,
1058
+ "step": 13500
1059
+ },
1060
+ {
1061
+ "epoch": 0.12963368950824986,
1062
+ "grad_norm": 0.9366750717163086,
1063
+ "learning_rate": 2.6619603346581027e-05,
1064
+ "loss": 0.2562,
1065
+ "step": 13600
1066
+ },
1067
+ {
1068
+ "epoch": 0.13058687840169286,
1069
+ "grad_norm": 0.9076129198074341,
1070
+ "learning_rate": 2.65904519439127e-05,
1071
+ "loss": 0.2567,
1072
+ "step": 13700
1073
+ },
1074
+ {
1075
+ "epoch": 0.13154006729513587,
1076
+ "grad_norm": 0.9610471725463867,
1077
+ "learning_rate": 2.6561300541244376e-05,
1078
+ "loss": 0.2528,
1079
+ "step": 13800
1080
+ },
1081
+ {
1082
+ "epoch": 0.13249325618857888,
1083
+ "grad_norm": 1.2852675914764404,
1084
+ "learning_rate": 2.653214913857605e-05,
1085
+ "loss": 0.2511,
1086
+ "step": 13900
1087
+ },
1088
+ {
1089
+ "epoch": 0.13344644508202191,
1090
+ "grad_norm": 0.8626914024353027,
1091
+ "learning_rate": 2.6502997735907726e-05,
1092
+ "loss": 0.2507,
1093
+ "step": 14000
1094
+ },
1095
+ {
1096
+ "epoch": 0.13344644508202191,
1097
+ "eval_loss": 0.21873866021633148,
1098
+ "eval_runtime": 23.5023,
1099
+ "eval_samples_per_second": 638.235,
1100
+ "eval_steps_per_second": 9.999,
1101
+ "step": 14000
1102
+ }
1103
+ ],
1104
+ "logging_steps": 100,
1105
+ "max_steps": 104911,
1106
+ "num_input_tokens_seen": 0,
1107
+ "num_train_epochs": 1,
1108
+ "save_steps": 1000,
1109
+ "stateful_callbacks": {
1110
+ "TrainerControl": {
1111
+ "args": {
1112
+ "should_epoch_stop": false,
1113
+ "should_evaluate": false,
1114
+ "should_log": false,
1115
+ "should_save": true,
1116
+ "should_training_stop": false
1117
+ },
1118
+ "attributes": {}
1119
+ }
1120
+ },
1121
+ "total_flos": 3.96629250834432e+16,
1122
+ "train_batch_size": 64,
1123
+ "trial_name": null,
1124
+ "trial_params": null
1125
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d32cce1a13996e4f7cefbd8e39686844d159861ca11a5837ea8148b57d593493
3
+ size 5112