Commit 43a6ebc
Parent(s): 2cd2c96
Removed the training option from the HuggingFace Space; the app now reads the pre-trained vocab from disk instead.
Files changed:
- app.py (+21 -15)
- data/paired_tokens.bpe (+1 -2)
- data/vocab.bpe (+0 -0)
- playground.ipynb (+0 -0)
- tests/test_tokenizer.py (+1 -1)
- tokenizer.py (+18 -0)
app.py CHANGED
@@ -1,5 +1,5 @@
 import gradio as gr
-from tokenizer import sanskrit_token_preprocessor, bpeAlgo, create_vocab, save_paired_tokens_vocab, save_vocab, encode, decode
+from tokenizer import sanskrit_token_preprocessor, bpeAlgo, create_vocab, save_paired_tokens_vocab, save_vocab, encode, decode, load_paired_tokens_vocab, load_vocab
 
 # Created a `TokenizerModel` class to maintain state between calls
 # - Added a web interface with separate tabs for training and tokenization
@@ -52,7 +52,13 @@ class TokenizerModel:
         # return {"Error":f"Please train the tokenizer first or ensure vocabulary files exist! {se}"}
         # except Exception as e:
         # return {"Error": str(e)}
-
+        try:
+            self.paired_tokens_vocab = load_paired_tokens_vocab("data/paired_tokens.bpe")
+            self.vocab = load_vocab("data/vocab.bpe")
+        except Exception as e:
+            print(e)
+            return {"Error": str(e)}
+        # return {"Error": "Please train it first to create vocab. Training samples can be found in the data dir"}
 
         encoded = encode(text, self.paired_tokens_vocab)
         decoded = decode(encoded, self.vocab)
@@ -77,20 +83,20 @@ with gr.Blocks(title="Sanskrit BPE Tokenizer") as demo:
     gr.Markdown("# Sanskrit BPE Tokenizer")
     gr.Markdown("This tokenizer implements Byte-Pair Encoding (BPE) for Sanskrit text.")
 
-    with gr.Tab("Train"):
-        train_input = gr.Textbox(
-            label="Training Text",
-            placeholder="Enter Sanskrit text for training...",
-            lines=5
-        )
-        train_button = gr.Button("Train Tokenizer")
-        train_output = gr.Textbox(label="Training Result")
+    # with gr.Tab("Train"):
+    #     train_input = gr.Textbox(
+    #         label="Training Text",
+    #         placeholder="Enter Sanskrit text for training...",
+    #         lines=5
+    #     )
+    #     train_button = gr.Button("Train Tokenizer")
+    #     train_output = gr.Textbox(label="Training Result")
 
-        train_button.click(
-            train_tokenizer,
-            inputs=train_input,
-            outputs=train_output
-        )
+    #     train_button.click(
+    #         train_tokenizer,
+    #         inputs=train_input,
+    #         outputs=train_output
+    #     )
 
     with gr.Tab("Tokenize"):
         text_input = gr.Textbox(
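Outside the Gradio callback, the new code path boils down to: load the two vocab files once, then pass them to `encode`/`decode`. A minimal sketch of that path, assuming the repo layout from this commit; the Sanskrit sample string is only illustrative:

```python
# Mirrors the try/except block added above, minus the Gradio plumbing.
from tokenizer import load_paired_tokens_vocab, load_vocab, encode, decode

paired_tokens_vocab = load_paired_tokens_vocab("data/paired_tokens.bpe")
vocab = load_vocab("data/vocab.bpe")

text = "संस्कृतम्"                       # illustrative input
token_ids = encode(text, paired_tokens_vocab)
print(token_ids)                         # BPE token ids
print(decode(token_ids, vocab))          # should round-trip back to `text`
```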
data/paired_tokens.bpe CHANGED
@@ -1,4 +1,3 @@
-Token version 1
 (224, 164):256
 (224, 165):257
 (257, 141):258
@@ -4742,4 +4741,4 @@ Token version 1
 (384, 442):4996
 (558, 4159):4997
 (1766, 360):4998
-(973, 334):4999
+(973, 334):4999
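Each line of this file records one BPE merge, `(left_id, right_id):new_id`, with merged ids starting at 256 because ids 0-255 are reserved for raw bytes. The `Token version 1` header is gone, so every line now parses uniformly (the final identical `-`/`+` pair typically just reflects a trailing-newline change). Note that the very first merges pair byte 224 (0xE0) with 164/165 (0xA4/0xA5): exactly the two-byte UTF-8 prefixes shared by all Devanagari codepoints, so these pairs dominate Sanskrit byte streams. A quick sanity check, standard library only:

```python
# Devanagari (U+0900-U+097F) encodes in UTF-8 as E0 A4 xx / E0 A5 xx,
# which is why (224, 164) and (224, 165) are the first merges learned.
first_char = "स".encode("utf-8")          # b'\xe0\xa4\xb8'
assert first_char[:2] == bytes([224, 164])
print([hex(b) for b in first_char])       # ['0xe0', '0xa4', '0xb8']
```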
data/vocab.bpe CHANGED
The diff for this file is too large to render. See raw diff.

playground.ipynb CHANGED
The diff for this file is too large to render. See raw diff.
tests/test_tokenizer.py CHANGED
@@ -1,5 +1,5 @@
 import pytest
-from
+from tokenizer import sanskrit_token_preprocessor, bpeAlgo, create_vocab, encode, decode
 
 @pytest.fixture
 def sample_text():
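This fixes a previously broken import line (a bare `from` with nothing after it). With the loaders added below in tokenizer.py, a round-trip test against the shipped vocab files also becomes possible. A hypothetical sketch, not part of this commit, using only signatures visible in the diff (`encode(text, paired_tokens_vocab)` and `decode(ids, vocab)`):

```python
# Hypothetical addition: round-trip the pre-trained vocab shipped in data/.
# Assumes decode(encode(x)) is lossless for valid Sanskrit input.
from tokenizer import load_paired_tokens_vocab, load_vocab, encode, decode

def test_roundtrip_pretrained(sample_text):
    paired_tokens_vocab = load_paired_tokens_vocab("data/paired_tokens.bpe")
    vocab = load_vocab("data/vocab.bpe")
    assert decode(encode(sample_text, paired_tokens_vocab), vocab) == sample_text
```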
tokenizer.py CHANGED
@@ -102,6 +102,24 @@ def save_vocab(filepath, vocab):
         f.write('Token version 1\n')
         for k,v in vocab.items():
             f.write(f"{k}:{v}\n")
+
+def load_paired_tokens_vocab(filepath):
+    paired_tokens = {}
+    with open(filepath, 'r') as f:
+        for line in f:
+            [k, v] = line.split(":")
+            paired_tokens[k] = v
+    return paired_tokens
+
+def load_vocab(filepath):
+    vocab = {}
+    with open(filepath, 'r') as f:
+        for line in f:
+            [k, v] = line.split("<::>")
+            vocab[int(k)] = eval(v)
+    return vocab
+
+
 
 
 
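Two caveats are worth flagging in these new loaders as committed: `load_paired_tokens_vocab` leaves keys as the literal strings `"(224, 164)"` and values as strings with a trailing newline (whether that matches what `encode` expects depends on its implementation, which this diff does not show), and `load_vocab` calls `eval` on file contents, which would execute arbitrary Python if the vocab file were tampered with. There is also an apparent format mismatch: `save_vocab` still writes `key:value` lines under a `Token version 1` header while `load_vocab` splits on `<::>`, presumably because the shipped `data/vocab.bpe` (whose diff is too large to render) already uses the `<::>` format. A stricter sketch, assuming tuple-of-int merge keys and literal-encoded vocab values:

```python
import ast

# Illustrative hardened loaders, not the committed versions.
def load_paired_tokens_vocab_strict(filepath):
    paired_tokens = {}
    with open(filepath, 'r') as f:
        for line in f:
            # rsplit keeps the "(a, b)" key intact even if a ':' ever appears in it
            k, v = line.rsplit(":", 1)
            paired_tokens[ast.literal_eval(k)] = int(v)   # e.g. (224, 164) -> 256
    return paired_tokens

def load_vocab_strict(filepath):
    vocab = {}
    with open(filepath, 'r') as f:
        for line in f:
            k, v = line.split("<::>", 1)
            # literal_eval parses the stored literal but never executes code
            vocab[int(k)] = ast.literal_eval(v)
    return vocab
```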