Commit 6388622 (unverified) by zijuncheng · 1 Parent(s): 934208b

initial files
.DS_Store ADDED
Binary file (6.15 kB).
 
README.md ADDED
@@ -0,0 +1,98 @@

### How to load the model and run inference

Download all of the files to a local directory `model_dir`.
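
One convenient way to get the files locally is `huggingface_hub` (a sketch only; the repo id below is a placeholder, substitute this repository's actual id):

```python
from huggingface_hub import snapshot_download

# Placeholder repo id -- substitute the actual repository id for this model.
model_dir = snapshot_download(repo_id="<user>/<this-repo>")
```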

#### Initialize the ONNX session

```python
import numpy as np
import onnxruntime as ort
from transformers import BartTokenizer, PreTrainedTokenizerFast

session = ort.InferenceSession(model_dir + "/custom_bart.onnx")
```
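
Optionally, `onnxruntime` accepts an explicit `providers` list if you want to pin the execution provider (CPU is assumed in this sketch; `"CUDAExecutionProvider"` can be listed first when `onnxruntime-gpu` is installed):

```python
# Optional: make the execution provider explicit (CPU assumed here).
session = ort.InferenceSession(
    model_dir + "/custom_bart.onnx",
    providers=["CPUExecutionProvider"],
)
```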

#### Load Tokenizers

```python
input_tokenizer = BartTokenizer.from_pretrained(model_dir + "/input_tokenizer")
output_tokenizer = PreTrainedTokenizerFast.from_pretrained(model_dir + "/output_tokenizer")
```

Set up the special tokens:

```python
bos_token_id = output_tokenizer.bos_token_id
eos_token_id = output_tokenizer.eos_token_id
pad_token_id = output_tokenizer.pad_token_id
```
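
As an optional sanity check, these ids should match the bundled `output_tokenizer` files, where `<unk>` = 0, `<pad>` = 1, `<s>` = 2 and `</s>` = 3:

```python
# Expected from output_tokenizer/tokenizer.json: bos=2, eos=3, pad=1
print(bos_token_id, eos_token_id, pad_token_id)
```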

#### Inference

The output tokenizer emits one symbol per token, so single spaces separate the characters of a word and runs of two or more spaces mark word boundaries. The helper below collapses the intra-word spaces:

```python
import re

def remove_intra_word_spaces(text):
    # Remove special tokens first (optional, if needed)
    text = text.replace("<s>", "").replace("</s>", "").strip()

    # Step 1: Split on 2+ spaces (which indicate word boundaries)
    words = re.split(r'\s{2,}', text)

    # Step 2: For each word, remove all single spaces (intra-word spacing)
    cleaned_words = [''.join(word.split()) for word in words]

    # Step 3: Join words back with a single space
    return ' '.join(cleaned_words)
```
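
For example (a made-up raw string, only to illustrate the spacing convention):

```python
# Hypothetical raw decoder output, not produced by the model here.
raw = "<s> ɪ z  ɐ </s>"
print(remove_intra_word_spaces(raw))  # -> "ɪz ɐ"
```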

A greedy-decoding inference function that drives the ONNX session:

```python
def greedy_decode_onnx_full_model(input_text, max_length=512, input_length=128):
    # Encode input
    inputs = input_tokenizer(input_text, return_tensors="np", padding=True, truncation=True, max_length=input_length)
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]

    # Initialize decoder with BOS
    decoder_input_ids = np.array([[bos_token_id]], dtype=np.int64)

    for _ in range(max_length):
        # Run ONNX forward
        ort_inputs = {
            "input_ids": input_ids.astype(np.int64),
            "attention_mask": attention_mask.astype(np.int64),
            "decoder_input_ids": decoder_input_ids.astype(np.int64),
        }

        logits = session.run(["logits"], ort_inputs)[0]
        next_token_logits = logits[:, -1, :]  # (batch, vocab)
        next_token_id = np.argmax(next_token_logits, axis=-1).reshape(1, 1)  # (1, 1)

        # Append new token to decoder input
        decoder_input_ids = np.concatenate([decoder_input_ids, next_token_id], axis=-1)

        if next_token_id[0][0] == eos_token_id:
            break

    # Decode final tokens
    decoded_text = output_tokenizer.decode(decoder_input_ids[0], skip_special_tokens=False)
    return decoded_text
```

Example:

```python
text = "This is a test."
output = greedy_decode_onnx_full_model(text)
cleaned = remove_intra_word_spaces(output)
print("Raw output:", output)
print("Cleaned:", cleaned)
```

This should print:

```
Raw output: <s> ð ˌ ɪ s  ɪ z  ɐ  t ˈ ɛ s t . </s>
Cleaned: ðˌɪs ɪz ɐ tˈɛst.
```
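
To convert several sentences at once, a thin wrapper around the two functions above is enough (a sketch; `phonemize_all` is not part of the original files):

```python
def phonemize_all(sentences):
    # Greedy-decode each sentence, then collapse the intra-word spaces.
    return [remove_intra_word_spaces(greedy_decode_onnx_full_model(s)) for s in sentences]
```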
custom_bart.onnx ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3bb274c77cbfd6e6d129c7e9203ae7e87530aca185a4cf9a840d090094958132
size 113868115
input_tokenizer/merges.txt ADDED
The diff for this file is too large to render.
 
input_tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
{
  "bos_token": {
    "content": "<s>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "cls_token": {
    "content": "<s>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "</s>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "mask_token": {
    "content": "<mask>",
    "lstrip": true,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "<pad>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "sep_token": {
    "content": "</s>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "unk_token": {
    "content": "<unk>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  }
}
input_tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,56 @@
{
  "add_prefix_space": false,
  "added_tokens_decoder": {
    "0": {
      "content": "<s>",
      "lstrip": false,
      "normalized": true,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "1": {
      "content": "<pad>",
      "lstrip": false,
      "normalized": true,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "2": {
      "content": "</s>",
      "lstrip": false,
      "normalized": true,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "3": {
      "content": "<unk>",
      "lstrip": false,
      "normalized": true,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "50264": {
      "content": "<mask>",
      "lstrip": true,
      "normalized": true,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "bos_token": "<s>",
  "clean_up_tokenization_spaces": true,
  "cls_token": "<s>",
  "eos_token": "</s>",
  "errors": "replace",
  "mask_token": "<mask>",
  "model_max_length": 1000000000000000019884624838656,
  "pad_token": "<pad>",
  "sep_token": "</s>",
  "tokenizer_class": "BartTokenizer",
  "unk_token": "<unk>"
}
input_tokenizer/vocab.json ADDED
The diff for this file is too large to render.
 
output_tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
{
  "bos_token": {
    "content": "<s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "</s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "<pad>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "unk_token": {
    "content": "<unk>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
output_tokenizer/tokenizer.json ADDED
@@ -0,0 +1,128 @@
{
  "version": "1.0",
  "truncation": null,
  "padding": null,
  "added_tokens": [
    {
      "id": 0,
      "content": "<unk>",
      "single_word": false,
      "lstrip": false,
      "rstrip": false,
      "normalized": false,
      "special": true
    },
    {
      "id": 1,
      "content": "<pad>",
      "single_word": false,
      "lstrip": false,
      "rstrip": false,
      "normalized": false,
      "special": true
    },
    {
      "id": 2,
      "content": "<s>",
      "single_word": false,
      "lstrip": false,
      "rstrip": false,
      "normalized": false,
      "special": true
    },
    {
      "id": 3,
      "content": "</s>",
      "single_word": false,
      "lstrip": false,
      "rstrip": false,
      "normalized": false,
      "special": true
    },
    {
      "id": 4,
      "content": " ",
      "single_word": false,
      "lstrip": false,
      "rstrip": false,
      "normalized": false,
      "special": true
    }
  ],
  "normalizer": null,
  "pre_tokenizer": {
    "type": "Whitespace"
  },
  "post_processor": null,
  "decoder": null,
  "model": {
    "type": "WordLevel",
    "vocab": {
      "<unk>": 0,
      "<pad>": 1,
      "<s>": 2,
      "</s>": 3,
      " ": 4,
      "ˈ": 5,
      "ə": 6,
      "n": 7,
      "ɹ": 8,
      "t": 9,
      "ɪ": 10,
      "s": 11,
      "l": 12,
      "d": 13,
      "k": 14,
      "i": 15,
      "ˌ": 16,
      "ɛ": 17,
      "z": 18,
      "æ": 19,
      "m": 20,
      "p": 21,
      "v": 22,
      "ð": 23,
      "f": 24,
      "ʌ": 25,
      "A": 26,
      "w": 27,
      "ɔ": 28,
      "ᵊ": 29,
      "I": 30,
      "ŋ": 31,
      ",": 32,
      "ɑ": 33,
      "b": 34,
      "ʃ": 35,
      "T": 36,
      "u": 37,
      "O": 38,
      "h": 39,
      "j": 40,
      "ʤ": 41,
      "ɡ": 42,
      ".": 43,
      "ɜ": 44,
      "ʧ": 45,
      "ɐ": 46,
      "ᵻ": 47,
      "W": 48,
      "ʊ": 49,
      "θ": 50,
      ";": 51,
      "Y": 52,
      "ʒ": 53,
      "”": 54,
      "“": 55,
      ")": 56,
      "(": 57,
      "?": 58,
      "—": 59,
      ":": 60,
      "!": 61,
      "[": 62,
      "]": 63
    },
    "unk_token": "<unk>"
  }
}
output_tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,52 @@
{
  "added_tokens_decoder": {
    "0": {
      "content": "<unk>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "1": {
      "content": "<pad>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "2": {
      "content": "<s>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "3": {
      "content": "</s>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "4": {
      "content": " ",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "bos_token": "<s>",
  "clean_up_tokenization_spaces": false,
  "eos_token": "</s>",
  "extra_special_tokens": {},
  "model_max_length": 1000000000000000019884624838656,
  "pad_token": "<pad>",
  "tokenizer_class": "PreTrainedTokenizerFast",
  "unk_token": "<unk>"
}