initial files
- README.md +70 -0
- tokenizer.json +249 -0
- tokenizer_config.json +43 -0
- w2p_bart.onnx +3 -0
README.md
ADDED
@@ -0,0 +1,70 @@
### How to load the model and run inferences

Download all the files to a local directory `model_dir`.

#### Initiate ONNX Session

```python
import numpy as np
import onnxruntime as ort
from transformers import PreTrainedTokenizerFast

model_dir = "."  # local directory holding the downloaded files

session = ort.InferenceSession(f"{model_dir}/w2p_bart.onnx")
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_dir)

bos_token_id = tokenizer.bos_token_id
eos_token_id = tokenizer.eos_token_id
```
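The batch-inference code below assumes the exported graph exposes inputs named `input_ids`, `attention_mask`, and `decoder_input_ids`, and a single output named `logits`. If your copy of the export differs, a quick sanity check with the standard onnxruntime API is:

```python
# List the tensor names the exported graph actually expects; the
# decoding loop below passes input_ids, attention_mask and
# decoder_input_ids, and reads back a single "logits" output.
print([i.name for i in session.get_inputs()])
print([o.name for o in session.get_outputs()])
```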
#### Batch Inference

```python
def g2p_onnx_batch(text, max_len=16):
    # 1. Preprocess: split into words and space out each word's characters,
    #    since the tokenizer's vocabulary is character-level
    words = text.strip().split()
    spaced_words = [" ".join(list(word)) for word in words]

    encoded = tokenizer(spaced_words, return_tensors="np", padding=True,
                        truncation=True, max_length=32)
    input_ids = encoded["input_ids"]
    attention_mask = encoded["attention_mask"]

    # 2. Greedy autoregressive decoding, one word per batch row,
    #    starting every row from the <s> token
    batch_size = input_ids.shape[0]
    decoder_input_ids = np.full((batch_size, 1), bos_token_id, dtype=np.int64)

    finished = np.zeros(batch_size, dtype=bool)

    for _ in range(max_len):
        ort_inputs = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "decoder_input_ids": decoder_input_ids,
        }
        logits = session.run(["logits"], ort_inputs)[0]
        next_token_logits = logits[:, -1, :]
        next_token_ids = np.argmax(next_token_logits, axis=-1)

        decoder_input_ids = np.concatenate(
            [decoder_input_ids, next_token_ids[:, None]], axis=1)

        # Stop once every row has emitted </s>
        finished |= (next_token_ids == eos_token_id)
        if finished.all():
            break

    # 3. Postprocess: drop special tokens and re-join each word's phonemes
    decoded = tokenizer.batch_decode(decoder_input_ids, skip_special_tokens=True)
    phonemes = [r.replace(" ", "") for r in decoded]
    return " ".join(phonemes)
```
Example:

```python
result = g2p_onnx_batch("banana apple question")
print(result)
```

This should return:

```
bənˈænə ˈæpᵊl kwˈɛsʧᵊn
```
tokenizer.json
ADDED
@@ -0,0 +1,249 @@
{
  "version": "1.0",
  "truncation": {
    "direction": "Right",
    "max_length": 32,
    "strategy": "LongestFirst",
    "stride": 0
  },
  "padding": {
    "strategy": "BatchLongest",
    "direction": "Right",
    "pad_to_multiple_of": null,
    "pad_id": 1,
    "pad_type_id": 0,
    "pad_token": "<pad>"
  },
  "added_tokens": [
    {
      "id": 0,
      "content": "<unk>",
      "single_word": false,
      "lstrip": false,
      "rstrip": false,
      "normalized": false,
      "special": true
    },
    {
      "id": 1,
      "content": "<pad>",
      "single_word": false,
      "lstrip": false,
      "rstrip": false,
      "normalized": false,
      "special": true
    },
    {
      "id": 2,
      "content": "<s>",
      "single_word": false,
      "lstrip": false,
      "rstrip": false,
      "normalized": false,
      "special": true
    },
    {
      "id": 3,
      "content": "</s>",
      "single_word": false,
      "lstrip": false,
      "rstrip": false,
      "normalized": false,
      "special": true
    }
  ],
  "normalizer": null,
  "pre_tokenizer": {
    "type": "Whitespace"
  },
  "post_processor": null,
  "decoder": null,
  "model": {
    "type": "WordLevel",
    "vocab": {
      "<unk>": 0,
      "<pad>": 1,
      "<s>": 2,
      "</s>": 3,
      "Ó": 4,
      "ł": 5,
      "ɔ": 6,
      "ᵻ": 7,
      "Q": 8,
      "h": 9,
      "F": 10,
      "f": 11,
      "J": 12,
      "¹": 13,
      "*": 14,
      "\"": 15,
      "@": 16,
      "©": 17,
      "õ": 18,
      "t": 19,
      "b": 20,
      "!": 21,
      "M": 22,
      "_": 23,
      "Ö": 24,
      "£": 25,
      "c": 26,
      "T": 27,
      "n": 28,
      "C": 29,
      "Â": 30,
      "ö": 31,
      "ɡ": 32,
      "u": 33,
      "7": 34,
      "ˌ": 35,
      "′": 36,
      "松": 37,
      "石": 38,
      "é": 39,
      "X": 40,
      ",": 41,
      "$": 42,
      "ú": 43,
      "‑": 44,
      "ü": 45,
      "s": 46,
      "ô": 47,
      "[": 48,
      "?": 49,
      "j": 50,
      "ə": 51,
      "ʤ": 52,
      "ñ": 53,
      "B": 54,
      "å": 55,
      "ř": 56,
      "G": 57,
      "I": 58,
      "o": 59,
      "0": 60,
      "^": 61,
      "‚": 62,
      ".": 63,
      "6": 64,
      "8": 65,
      "Ћ": 66,
      "/": 67,
      "+": 68,
      "—": 69,
      "ù": 70,
      "g": 71,
      "–": 72,
      "=": 73,
      "°": 74,
      "Ł": 75,
      "″": 76,
      "“": 77,
      "‘": 78,
      "R": 79,
      "’": 80,
      "½": 81,
      ">": 82,
      "v": 83,
      "紅": 84,
      "r": 85,
      "": 86,
      "V": 87,
      "ó": 88,
      "a": 89,
      "#": 90,
      "Y": 91,
      "Z": 92,
      "¬": 93,
      "ø": 94,
      "y": 95,
      "": 96,
      "ɹ": 97,
      "(": 98,
      "…": 99,
      "ᵊ": 100,
      "π": 101,
      "]": 102,
      "d": 103,
      "ç": 104,
      "2": 105,
      "L": 106,
      "ɕ": 107,
      "á": 108,
      "D": 109,
      "à": 110,
      "": 111,
      "m": 112,
      "€": 113,
      "˚": 114,
      "ã": 115,
      "ä": 116,
      "z": 117,
      "S": 118,
      "ɑ": 119,
      "”": 120,
      "E": 121,
      "ʧ": 122,
      "l": 123,
      "ð": 124,
      "Ã": 125,
      "'": 126,
      "ê": 127,
      "H": 128,
      "Á": 129,
      "-": 130,
      "U": 131,
      "綠": 132,
      "e": 133,
      "N": 134,
      "1": 135,
      "ɐ": 136,
      "鮭": 137,
      "3": 138,
      ")": 139,
      "è": 140,
      "í": 141,
      "藍": 142,
      "i": 143,
      "5": 144,
      "A": 145,
      "ɪ": 146,
      "%": 147,
      "ʃ": 148,
      "9": 149,
      "w": 150,
      "ʒ": 151,
      "ë": 152,
      "ˈ": 153,
      "k": 154,
      "O": 155,
      "W": 156,
      "ɛ": 157,
      "Ä": 158,
      "P": 159,
      "p": 160,
      "ŋ": 161,
      "¥": 162,
      "ʌ": 163,
      "™": 164,
      "K": 165,
      "q": 166,
      "×": 167,
      "`": 168,
      "θ": 169,
      "ʊ": 170,
      "x": 171,
      "|": 172,
      "â": 173,
      "æ": 174,
      ":": 175,
      "4": 176,
      "&": 177,
      ";": 178,
      "": 179,
      " ": 180,
      "ɜ": 181
    },
    "unk_token": "<unk>"
  }
}
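The model is a word-level vocabulary over single characters with whitespace pre-tokenization, which is why the README spaces out each word before encoding. A minimal check with the `tokenizers` library, with the expected ids read off the vocab above:

```python
from tokenizers import Tokenizer

tok = Tokenizer.from_file("tokenizer.json")
# Whitespace pre-tokenization turns the spaced-out word into one token
# per character; post_processor is null, so no <s>/</s> are added here.
enc = tok.encode("b a n a n a")
print(enc.tokens)  # ['b', 'a', 'n', 'a', 'n', 'a']
print(enc.ids)     # [20, 89, 28, 89, 28, 89]
```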
tokenizer_config.json
ADDED
@@ -0,0 +1,43 @@
{
  "added_tokens_decoder": {
    "0": {
      "content": "<unk>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "1": {
      "content": "<pad>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "2": {
      "content": "<s>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "3": {
      "content": "</s>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "bos_token": "<s>",
  "clean_up_tokenization_spaces": true,
  "eos_token": "</s>",
  "model_max_length": 1000000000000000019884624838656,
  "pad_token": "<pad>",
  "tokenizer_class": "PreTrainedTokenizerFast",
  "unk_token": "<unk>"
}
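These special tokens supply the ids that the README's decoding loop starts and stops on. A quick check, assuming the files sit in the current directory:

```python
from transformers import PreTrainedTokenizerFast

tokenizer = PreTrainedTokenizerFast.from_pretrained(".")
# Per tokenizer.json: <s> = 2, </s> = 3, <pad> = 1
print(tokenizer.bos_token_id, tokenizer.eos_token_id, tokenizer.pad_token_id)
# 2 3 1
```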
w2p_bart.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:28e42968b3c2f6b5e2826c7b5c05195ae35bb242824f3908210ee4b07a7f5608
size 42555254