zijuncheng committed
Commit d5b64cc · unverified · 1 parent: a38dd56

initial files

Files changed (4)
  1. README.md +70 -0
  2. tokenizer.json +249 -0
  3. tokenizer_config.json +43 -0
  4. w2p_bart.onnx +3 -0
README.md ADDED
@@ -0,0 +1,70 @@
+ ### How to load the model and run inference
+
+ Download all the files to a local directory `model_dir`.
+
+ #### Initialize the ONNX Session
+
+ ```python
+ import numpy as np
+ import onnxruntime as ort
+ from transformers import PreTrainedTokenizerFast
+
+ model_dir = "model_dir"  # local directory containing the downloaded files
+
+ session = ort.InferenceSession(f"{model_dir}/w2p_bart.onnx")
+ tokenizer = PreTrainedTokenizerFast.from_pretrained(model_dir)
+
+ bos_token_id = tokenizer.bos_token_id
+ eos_token_id = tokenizer.eos_token_id
+ ```
+
+ #### Batch Inference
+
+ ```python
+ def g2p_onnx_batch(text, max_len=16):
+     # 1. Preprocess: split into words and space out each word's characters
+     words = text.strip().split()
+     spaced_words = [" ".join(list(word)) for word in words]
+
+     encoded = tokenizer(spaced_words, return_tensors="np", padding=True, truncation=True, max_length=32)
+     input_ids = encoded["input_ids"]
+     attention_mask = encoded["attention_mask"]
+
+     # 2. Greedy decoding, starting every sequence from the BOS token
+     batch_size = input_ids.shape[0]
+     decoder_input_ids = np.full((batch_size, 1), bos_token_id, dtype=np.int64)
+
+     finished = np.zeros(batch_size, dtype=bool)
+
+     for _ in range(max_len):
+         ort_inputs = {
+             "input_ids": input_ids,
+             "attention_mask": attention_mask,
+             "decoder_input_ids": decoder_input_ids
+         }
+         logits = session.run(["logits"], ort_inputs)[0]
+         next_token_logits = logits[:, -1, :]
+         next_token_ids = np.argmax(next_token_logits, axis=-1)
+
+         decoder_input_ids = np.concatenate([decoder_input_ids, next_token_ids[:, None]], axis=1)
+
+         # Stop once every sequence in the batch has produced EOS
+         finished |= (next_token_ids == eos_token_id)
+         if finished.all():
+             break
+
+     # 3. Postprocess: drop the spaces between the predicted phoneme symbols
+     decoded = tokenizer.batch_decode(decoder_input_ids, skip_special_tokens=True)
+     phonemes = [r.replace(" ", "") for r in decoded]
+     return " ".join(phonemes)
+ ```
+
+ Example:
+
+ ```python
+ result = g2p_onnx_batch("banana apple question")
+ print(result)
+ ```
+
+ This should print:
+
+ ```text
+ bənˈænə ˈæpᵊl kwˈɛsʧᵊn
+ ```
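
The README above assumes the repository files have already been downloaded into a local directory `model_dir`. A minimal sketch of doing that with `huggingface_hub` (the `repo_id` below is a placeholder assumption and must be replaced with this model's actual repo id):

```python
# Sketch: download every file in the repo (README.md, tokenizer.json,
# tokenizer_config.json, w2p_bart.onnx) into a local folder.
# NOTE: "<namespace>/<model-name>" is a placeholder, not the real repo id.
from huggingface_hub import snapshot_download

model_dir = snapshot_download(repo_id="<namespace>/<model-name>", local_dir="model_dir")
print(model_dir)  # pass this path to PreTrainedTokenizerFast.from_pretrained
```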
tokenizer.json ADDED
@@ -0,0 +1,249 @@
+ {
+   "version": "1.0",
+   "truncation": {
+     "direction": "Right",
+     "max_length": 32,
+     "strategy": "LongestFirst",
+     "stride": 0
+   },
+   "padding": {
+     "strategy": "BatchLongest",
+     "direction": "Right",
+     "pad_to_multiple_of": null,
+     "pad_id": 1,
+     "pad_type_id": 0,
+     "pad_token": "<pad>"
+   },
+   "added_tokens": [
+     {
+       "id": 0,
+       "content": "<unk>",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     },
+     {
+       "id": 1,
+       "content": "<pad>",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     },
+     {
+       "id": 2,
+       "content": "<s>",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     },
+     {
+       "id": 3,
+       "content": "</s>",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     }
+   ],
+   "normalizer": null,
+   "pre_tokenizer": {
+     "type": "Whitespace"
+   },
+   "post_processor": null,
+   "decoder": null,
+   "model": {
+     "type": "WordLevel",
+     "vocab": {
+       "<unk>": 0,
+       "<pad>": 1,
+       "<s>": 2,
+       "</s>": 3,
+       "Ó": 4,
+       "ł": 5,
+       "ɔ": 6,
+       "ᵻ": 7,
+       "Q": 8,
+       "h": 9,
+       "F": 10,
+       "f": 11,
+       "J": 12,
+       "¹": 13,
+       "*": 14,
+       "\"": 15,
+       "@": 16,
+       "©": 17,
+       "õ": 18,
+       "t": 19,
+       "b": 20,
+       "!": 21,
+       "M": 22,
+       "_": 23,
+       "Ö": 24,
+       "£": 25,
+       "c": 26,
+       "T": 27,
+       "n": 28,
+       "C": 29,
+       "Â": 30,
+       "ö": 31,
+       "ɡ": 32,
+       "u": 33,
+       "7": 34,
+       "ˌ": 35,
+       "′": 36,
+       "松": 37,
+       "石": 38,
+       "é": 39,
+       "X": 40,
+       ",": 41,
+       "$": 42,
+       "ú": 43,
+       "‑": 44,
+       "ü": 45,
+       "s": 46,
+       "ô": 47,
+       "[": 48,
+       "?": 49,
+       "j": 50,
+       "ə": 51,
+       "ʤ": 52,
+       "ñ": 53,
+       "B": 54,
+       "å": 55,
+       "ř": 56,
+       "G": 57,
+       "I": 58,
+       "o": 59,
+       "0": 60,
+       "^": 61,
+       "‚": 62,
+       ".": 63,
+       "6": 64,
+       "8": 65,
+       "Ћ": 66,
+       "/": 67,
+       "+": 68,
+       "—": 69,
+       "ù": 70,
+       "g": 71,
+       "–": 72,
+       "=": 73,
+       "°": 74,
+       "Ł": 75,
+       "″": 76,
+       "“": 77,
+       "‘": 78,
+       "R": 79,
+       "’": 80,
+       "½": 81,
+       ">": 82,
+       "v": 83,
+       "紅": 84,
+       "r": 85,
+       "‪": 86,
+       "V": 87,
+       "ó": 88,
+       "a": 89,
+       "#": 90,
+       "Y": 91,
+       "Z": 92,
+       "¬": 93,
+       "ø": 94,
+       "y": 95,
+       "‎": 96,
+       "ɹ": 97,
+       "(": 98,
+       "…": 99,
+       "ᵊ": 100,
+       "π": 101,
+       "]": 102,
+       "d": 103,
+       "ç": 104,
+       "2": 105,
+       "L": 106,
+       "ɕ": 107,
+       "á": 108,
+       "D": 109,
+       "à": 110,
+       "­": 111,
+       "m": 112,
+       "€": 113,
+       "˚": 114,
+       "ã": 115,
+       "ä": 116,
+       "z": 117,
+       "S": 118,
+       "ɑ": 119,
+       "”": 120,
+       "E": 121,
+       "ʧ": 122,
+       "l": 123,
+       "ð": 124,
+       "Ã": 125,
+       "'": 126,
+       "ê": 127,
+       "H": 128,
+       "Á": 129,
+       "-": 130,
+       "U": 131,
+       "綠": 132,
+       "e": 133,
+       "N": 134,
+       "1": 135,
+       "ɐ": 136,
+       "鮭": 137,
+       "3": 138,
+       ")": 139,
+       "è": 140,
+       "í": 141,
+       "藍": 142,
+       "i": 143,
+       "5": 144,
+       "A": 145,
+       "ɪ": 146,
+       "%": 147,
+       "ʃ": 148,
+       "9": 149,
+       "w": 150,
+       "ʒ": 151,
+       "ë": 152,
+       "ˈ": 153,
+       "k": 154,
+       "O": 155,
+       "W": 156,
+       "ɛ": 157,
+       "Ä": 158,
+       "P": 159,
+       "p": 160,
+       "ŋ": 161,
+       "¥": 162,
+       "ʌ": 163,
+       "™": 164,
+       "K": 165,
+       "q": 166,
+       "×": 167,
+       "`": 168,
+       "θ": 169,
+       "ʊ": 170,
+       "x": 171,
+       "|": 172,
+       "â": 173,
+       "æ": 174,
+       ":": 175,
+       "4": 176,
+       "&": 177,
+       ";": 178,
+       "​": 179,
+       " ": 180,
+       "ɜ": 181
+     },
+     "unk_token": "<unk>"
+   }
+ }
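
According to `tokenizer.json` above, this is a character-level `WordLevel` vocabulary with a `Whitespace` pre-tokenizer and no post-processor, which is why the README spaces out each word's characters before encoding. A minimal sketch of inspecting that behavior with the standalone `tokenizers` library (an alternative to the `PreTrainedTokenizerFast` path used in the README; the file path assumes the README's `model_dir` layout):

```python
# Sketch: load tokenizer.json directly and encode one spaced-out word,
# mirroring the README's preprocessing step.
from tokenizers import Tokenizer

tok = Tokenizer.from_file("model_dir/tokenizer.json")
enc = tok.encode(" ".join("banana"))  # "b a n a n a"
print(enc.tokens)  # ['b', 'a', 'n', 'a', 'n', 'a']
print(enc.ids)     # per the vocab above: [20, 89, 28, 89, 28, 89]
```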
tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<pad>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "<s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<s>",
+   "clean_up_tokenization_spaces": true,
+   "eos_token": "</s>",
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "<pad>",
+   "tokenizer_class": "PreTrainedTokenizerFast",
+   "unk_token": "<unk>"
+ }
w2p_bart.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:28e42968b3c2f6b5e2826c7b5c05195ae35bb242824f3908210ee4b07a7f5608
+ size 42555254
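
`w2p_bart.onnx` is stored via Git LFS, so the pointer above records only the file's SHA-256 and size. A small sketch for checking that a downloaded copy matches the pointer (the path assumes the README's `model_dir` layout):

```python
# Sketch: verify a downloaded w2p_bart.onnx against the LFS pointer above.
import hashlib

expected_sha256 = "28e42968b3c2f6b5e2826c7b5c05195ae35bb242824f3908210ee4b07a7f5608"
expected_size = 42555254  # bytes, from the pointer

with open("model_dir/w2p_bart.onnx", "rb") as f:
    data = f.read()

assert len(data) == expected_size, "size mismatch (pointer file or partial download?)"
assert hashlib.sha256(data).hexdigest() == expected_sha256, "checksum mismatch"
print("w2p_bart.onnx matches the LFS pointer")
```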