# gitesh-grover's picture
# Removed the training option from the HuggingFace. Reading the pre-trained vocab
# 43a6ebc
import gradio as gr
from tokenizer import sanskrit_token_preprocessor, bpeAlgo, create_vocab, save_paired_tokens_vocab, save_vocab, encode, decode, load_paired_tokens_vocab, load_vocab
# Created a `TokenizerModel` class to maintain state between calls
# - Added a web interface with separate tabs for training and tokenization
# - Included example texts and documentation
# - Made the interface more user-friendly with clear labels and instructions
# - Added JSON output for better visualization of results
# The app's main functionality:
# 1. Tokenizing text using the pre-trained model (vocab loaded from data/)
# 2. Training on custom text is supported by TokenizerModel.train but is
#    disabled in the hosted UI (the Train tab is removed)
class TokenizerModel:
    """Holds BPE tokenizer state (merge table and vocab) across Gradio calls."""

    def __init__(self, vocab_size=5000):
        # Target vocabulary size: 256 raw-byte tokens + learned merges.
        self.vocab_size = vocab_size
        # Learned merge pairs; None until train() runs or a saved vocab is loaded.
        self.paired_tokens_vocab = None
        # Token-id -> byte-sequence mapping derived from the merge pairs.
        self.vocab = None

    def train(self, text):
        """Train the BPE tokenizer on `text`.

        Returns a human-readable status string (success summary, or the
        error message so the UI can display it instead of crashing).
        """
        # Preprocess into Sanskrit-aware tokens, then into raw UTF-8 byte lists.
        tokens = sanskrit_token_preprocessor(text)
        tokens = [list(tok.encode('utf-8')) for tok in tokens]
        # Total byte count before merging, for the compression ratio below.
        orig_tokens_len = sum(len(toks) for toks in tokens)

        try:
            # First 256 ids are reserved for raw bytes; learn the remaining merges.
            self.paired_tokens_vocab, encoded_tokens = bpeAlgo(
                tokens, self.vocab_size - 256, 256
            )
            self.vocab = create_vocab(self.paired_tokens_vocab)
        except ValueError as e:
            # Surface the error message in the UI.
            return f"Error during training: {str(e)}"

        # Compression ratio: original bytes vs. BPE-encoded tokens.
        # Guard against empty input producing a zero-length encoding.
        compression_ratio = (
            orig_tokens_len / len(encoded_tokens) if encoded_tokens else 0.0
        )
        return (
            f"Training completed! Vocabulary size: {len(self.vocab)} "
            f"and Compression Ratio: {compression_ratio:.2f}"
        )

    def tokenize(self, text):
        """Encode `text` with the BPE model and round-trip decode it.

        Lazily loads the pre-trained vocab from data/ on first use.
        Returns a dict rendered by the Gradio JSON component; on failure
        returns {"Error": message} instead of raising.
        """
        if self.paired_tokens_vocab is None:
            try:
                self.paired_tokens_vocab = load_paired_tokens_vocab("data/paired_tokens.bpe")
                self.vocab = load_vocab("data/vocab.bpe")
            except Exception as e:
                # Log for the server console, report to the UI.
                print(e)
                return {"Error": str(e)}

        encoded = encode(text, self.paired_tokens_vocab)
        decoded = decode(encoded, self.vocab)
        return {
            "Encoded tokens": encoded,
            "Decoded text": decoded,
            "Matches original": decoded == text,
        }
# Create the module-level tokenizer instance shared by all Gradio callbacks.
# Its state (merge table / vocab) persists for the lifetime of the app process.
tokenizer = TokenizerModel()
def train_tokenizer(text):
    """Gradio callback: delegate training to the shared tokenizer instance."""
    status = tokenizer.train(text)
    return status
def process_text(text):
    """Gradio callback: tokenize `text` with the shared tokenizer instance."""
    result = tokenizer.tokenize(text)
    return result
# Build the Gradio interface. The Train tab was removed from the hosted app;
# the tokenizer loads its pre-trained vocabulary from data/ on first use.
with gr.Blocks(title="Sanskrit BPE Tokenizer") as demo:
    gr.Markdown("# Sanskrit BPE Tokenizer")
    gr.Markdown("This tokenizer implements Byte-Pair Encoding (BPE) for Sanskrit text.")

    with gr.Tab("Tokenize"):
        text_input = gr.Textbox(
            label="Input Text",
            placeholder="Enter Sanskrit text to tokenize...",
            lines=3
        )
        tokenize_button = gr.Button("Tokenize")
        result_output = gr.JSON(label="Results")
        # Wire the button to the tokenize callback; result rendered as JSON.
        tokenize_button.click(
            process_text,
            inputs=text_input,
            outputs=result_output
        )

    # Example text kept at column 0 so the markdown string content is unchanged.
    gr.Markdown("""
### Example texts:
```
चीराण्यपास्याज्जनकस्य कन्या नेयं प्रतिज्ञा मम दत्तपूर्वा।
यथासुखं गच्छतु राजपुत्री वनं समग्रा सह सर्वरत्नैः॥
```
""")

if __name__ == "__main__":
    demo.launch()