Commit 6388622 (unverified) by zijuncheng · 1 Parent(s): 934208b

initial files
.DS_Store ADDED
Binary file (6.15 kB).
 
README.md ADDED
@@ -0,0 +1,98 @@

### How to load the model and run inference

Download all of the files to a local directory `model_dir`.
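
One convenient way to get the files locally is `huggingface_hub` (a sketch only; the repo id below is a placeholder, substitute this repository's actual id):

```python
from huggingface_hub import snapshot_download

# Placeholder repo id -- substitute the actual repository id for this model.
model_dir = snapshot_download(repo_id="<user>/<this-repo>")
```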

#### Initialize the ONNX session

```python
import numpy as np
import onnxruntime as ort
from transformers import BartTokenizer, PreTrainedTokenizerFast

session = ort.InferenceSession(model_dir + "/custom_bart.onnx")
```
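
Optionally, `onnxruntime` accepts an explicit `providers` list if you want to pin the execution provider (CPU is assumed in this sketch; `"CUDAExecutionProvider"` can be listed first when `onnxruntime-gpu` is installed):

```python
# Optional: make the execution provider explicit (CPU assumed here).
session = ort.InferenceSession(
    model_dir + "/custom_bart.onnx",
    providers=["CPUExecutionProvider"],
)
```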

#### Load Tokenizers

```python
input_tokenizer = BartTokenizer.from_pretrained(model_dir + "/input_tokenizer")
output_tokenizer = PreTrainedTokenizerFast.from_pretrained(model_dir + "/output_tokenizer")
```

Set up the special tokens:

```python
bos_token_id = output_tokenizer.bos_token_id
eos_token_id = output_tokenizer.eos_token_id
pad_token_id = output_tokenizer.pad_token_id
```
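
As an optional sanity check, these ids should match the bundled `output_tokenizer` files, where `<unk>` = 0, `<pad>` = 1, `<s>` = 2 and `</s>` = 3:

```python
# Expected from output_tokenizer/tokenizer.json: bos=2, eos=3, pad=1
print(bos_token_id, eos_token_id, pad_token_id)
```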

#### Inference

The output tokenizer emits one symbol per token, so single spaces separate the characters of a word and runs of two or more spaces mark word boundaries. The helper below collapses the intra-word spaces:

```python
import re

def remove_intra_word_spaces(text):
    # Remove special tokens first (optional, if needed)
    text = text.replace("<s>", "").replace("</s>", "").strip()

    # Step 1: Split on 2+ spaces (which indicate word boundaries)
    words = re.split(r'\s{2,}', text)

    # Step 2: For each word, remove all single spaces (intra-word spacing)
    cleaned_words = [''.join(word.split()) for word in words]

    # Step 3: Join words back with a single space
    return ' '.join(cleaned_words)
```
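
For example (a made-up raw string, only to illustrate the spacing convention):

```python
# Hypothetical raw decoder output, not produced by the model here.
raw = "<s> ɪ z  ɐ </s>"
print(remove_intra_word_spaces(raw))  # -> "ɪz ɐ"
```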

A greedy-decoding inference function that drives the ONNX session:

```python
def greedy_decode_onnx_full_model(input_text, max_length=512, input_length=128):
    # Encode input
    inputs = input_tokenizer(input_text, return_tensors="np", padding=True, truncation=True, max_length=input_length)
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]

    # Initialize decoder with BOS
    decoder_input_ids = np.array([[bos_token_id]], dtype=np.int64)

    for _ in range(max_length):
        # Run ONNX forward
        ort_inputs = {
            "input_ids": input_ids.astype(np.int64),
            "attention_mask": attention_mask.astype(np.int64),
            "decoder_input_ids": decoder_input_ids.astype(np.int64),
        }

        logits = session.run(["logits"], ort_inputs)[0]
        next_token_logits = logits[:, -1, :]  # (batch, vocab)
        next_token_id = np.argmax(next_token_logits, axis=-1).reshape(1, 1)  # (1, 1)

        # Append new token to decoder input
        decoder_input_ids = np.concatenate([decoder_input_ids, next_token_id], axis=-1)

        if next_token_id[0][0] == eos_token_id:
            break

    # Decode final tokens
    decoded_text = output_tokenizer.decode(decoder_input_ids[0], skip_special_tokens=False)
    return decoded_text
```

Example:

```python
text = "This is a test."
output = greedy_decode_onnx_full_model(text)
cleaned = remove_intra_word_spaces(output)
print("Raw output:", output)
print("Cleaned:", cleaned)
```

This should print:

```
Raw output: <s> ð ˌ ɪ s  ɪ z  ɐ  t ˈ ɛ s t . </s>
Cleaned: ðˌɪs ɪz ɐ tˈɛst.
```
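
To convert several sentences at once, a thin wrapper around the two functions above is enough (a sketch; `phonemize_all` is not part of the original files):

```python
def phonemize_all(sentences):
    # Greedy-decode each sentence, then collapse the intra-word spaces.
    return [remove_intra_word_spaces(greedy_decode_onnx_full_model(s)) for s in sentences]
```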
custom_bart.onnx ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3bb274c77cbfd6e6d129c7e9203ae7e87530aca185a4cf9a840d090094958132
size 113868115
input_tokenizer/merges.txt ADDED
The diff for this file is too large to render.
 
input_tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
{
  "bos_token": {
    "content": "<s>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "cls_token": {
    "content": "<s>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "</s>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "mask_token": {
    "content": "<mask>",
    "lstrip": true,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "<pad>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "sep_token": {
    "content": "</s>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "unk_token": {
    "content": "<unk>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  }
}
input_tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,56 @@
{
  "add_prefix_space": false,
  "added_tokens_decoder": {
    "0": {
      "content": "<s>",
      "lstrip": false,
      "normalized": true,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "1": {
      "content": "<pad>",
      "lstrip": false,
      "normalized": true,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "2": {
      "content": "</s>",
      "lstrip": false,
      "normalized": true,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "3": {
      "content": "<unk>",
      "lstrip": false,
      "normalized": true,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "50264": {
      "content": "<mask>",
      "lstrip": true,
      "normalized": true,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "bos_token": "<s>",
  "clean_up_tokenization_spaces": true,
  "cls_token": "<s>",
  "eos_token": "</s>",
  "errors": "replace",
  "mask_token": "<mask>",
  "model_max_length": 1000000000000000019884624838656,
  "pad_token": "<pad>",
  "sep_token": "</s>",
  "tokenizer_class": "BartTokenizer",
  "unk_token": "<unk>"
}
input_tokenizer/vocab.json ADDED
The diff for this file is too large to render.
 
output_tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
{
  "bos_token": {
    "content": "<s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "</s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "<pad>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "unk_token": {
    "content": "<unk>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
output_tokenizer/tokenizer.json ADDED
@@ -0,0 +1,128 @@
{
  "version": "1.0",
  "truncation": null,
  "padding": null,
  "added_tokens": [
    {
      "id": 0,
      "content": "<unk>",
      "single_word": false,
      "lstrip": false,
      "rstrip": false,
      "normalized": false,
      "special": true
    },
    {
      "id": 1,
      "content": "<pad>",
      "single_word": false,
      "lstrip": false,
      "rstrip": false,
      "normalized": false,
      "special": true
    },
    {
      "id": 2,
      "content": "<s>",
      "single_word": false,
      "lstrip": false,
      "rstrip": false,
      "normalized": false,
      "special": true
    },
    {
      "id": 3,
      "content": "</s>",
      "single_word": false,
      "lstrip": false,
      "rstrip": false,
      "normalized": false,
      "special": true
    },
    {
      "id": 4,
      "content": " ",
      "single_word": false,
      "lstrip": false,
      "rstrip": false,
      "normalized": false,
      "special": true
    }
  ],
  "normalizer": null,
  "pre_tokenizer": {
    "type": "Whitespace"
  },
  "post_processor": null,
  "decoder": null,
  "model": {
    "type": "WordLevel",
    "vocab": {
      "<unk>": 0,
      "<pad>": 1,
      "<s>": 2,
      "</s>": 3,
      " ": 4,
      "ˈ": 5,
      "ə": 6,
      "n": 7,
      "ɹ": 8,
      "t": 9,
      "ɪ": 10,
      "s": 11,
      "l": 12,
      "d": 13,
      "k": 14,
      "i": 15,
      "ˌ": 16,
      "ɛ": 17,
      "z": 18,
      "æ": 19,
      "m": 20,
      "p": 21,
      "v": 22,
      "ð": 23,
      "f": 24,
      "ʌ": 25,
      "A": 26,
      "w": 27,
      "ɔ": 28,
      "ᵊ": 29,
      "I": 30,
      "ŋ": 31,
      ",": 32,
      "ɑ": 33,
      "b": 34,
      "ʃ": 35,
      "T": 36,
      "u": 37,
      "O": 38,
      "h": 39,
      "j": 40,
      "ʤ": 41,
      "ɡ": 42,
      ".": 43,
      "ɜ": 44,
      "ʧ": 45,
      "ɐ": 46,
      "ᵻ": 47,
      "W": 48,
      "ʊ": 49,
      "θ": 50,
      ";": 51,
      "Y": 52,
      "ʒ": 53,
      "”": 54,
      "“": 55,
      ")": 56,
      "(": 57,
      "?": 58,
      "—": 59,
      ":": 60,
      "!": 61,
      "[": 62,
      "]": 63
    },
    "unk_token": "<unk>"
  }
}
output_tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,52 @@
{
  "added_tokens_decoder": {
    "0": {
      "content": "<unk>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "1": {
      "content": "<pad>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "2": {
      "content": "<s>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "3": {
      "content": "</s>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "4": {
      "content": " ",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "bos_token": "<s>",
  "clean_up_tokenization_spaces": false,
  "eos_token": "</s>",
  "extra_special_tokens": {},
  "model_max_length": 1000000000000000019884624838656,
  "pad_token": "<pad>",
  "tokenizer_class": "PreTrainedTokenizerFast",
  "unk_token": "<unk>"
}