# gitesh-grover's picture
# Removed the training option from the HuggingFace. Reading the pre-trained vocab
# 43a6ebc
import gradio as gr
from tokenizer import sanskrit_token_preprocessor, bpeAlgo, create_vocab, save_paired_tokens_vocab, save_vocab, encode, decode, load_paired_tokens_vocab, load_vocab
# Created a `TokenizerModel` class to maintain state between calls
# - Added a web interface with separate tabs for training and tokenization
# - Included example texts and documentation
# - Made the interface more user-friendly with clear labels and instructions
# - Added JSON output for better visualization of results
# The app's main functionality:
# 1. Tokenizing text using the pre-trained model (vocab loaded from data/)
# 2. Training on custom text is supported by TokenizerModel.train but is
#    disabled in the hosted UI (the Train tab is removed)
class TokenizerModel:
    """Holds BPE tokenizer state (merge table and vocab) across Gradio calls."""

    def __init__(self, vocab_size=5000):
        # Target vocabulary size: 256 raw-byte tokens + learned merges.
        self.vocab_size = vocab_size
        # Learned merge pairs; None until train() runs or a saved vocab is loaded.
        self.paired_tokens_vocab = None
        # Token-id -> byte-sequence mapping derived from the merge pairs.
        self.vocab = None

    def train(self, text):
        """Train the BPE tokenizer on `text`.

        Returns a human-readable status string (success summary, or the
        error message so the UI can display it instead of crashing).
        """
        # Preprocess into Sanskrit-aware tokens, then into raw UTF-8 byte lists.
        tokens = sanskrit_token_preprocessor(text)
        tokens = [list(tok.encode('utf-8')) for tok in tokens]
        # Total byte count before merging, for the compression ratio below.
        orig_tokens_len = sum(len(toks) for toks in tokens)

        try:
            # First 256 ids are reserved for raw bytes; learn the remaining merges.
            self.paired_tokens_vocab, encoded_tokens = bpeAlgo(
                tokens, self.vocab_size - 256, 256
            )
            self.vocab = create_vocab(self.paired_tokens_vocab)
        except ValueError as e:
            # Surface the error message in the UI.
            return f"Error during training: {str(e)}"

        # Compression ratio: original bytes vs. BPE-encoded tokens.
        # Guard against empty input producing a zero-length encoding.
        compression_ratio = (
            orig_tokens_len / len(encoded_tokens) if encoded_tokens else 0.0
        )
        return (
            f"Training completed! Vocabulary size: {len(self.vocab)} "
            f"and Compression Ratio: {compression_ratio:.2f}"
        )

    def tokenize(self, text):
        """Encode `text` with the BPE model and round-trip decode it.

        Lazily loads the pre-trained vocab from data/ on first use.
        Returns a dict rendered by the Gradio JSON component; on failure
        returns {"Error": message} instead of raising.
        """
        if self.paired_tokens_vocab is None:
            try:
                self.paired_tokens_vocab = load_paired_tokens_vocab("data/paired_tokens.bpe")
                self.vocab = load_vocab("data/vocab.bpe")
            except Exception as e:
                # Log for the server console, report to the UI.
                print(e)
                return {"Error": str(e)}

        encoded = encode(text, self.paired_tokens_vocab)
        decoded = decode(encoded, self.vocab)
        return {
            "Encoded tokens": encoded,
            "Decoded text": decoded,
            "Matches original": decoded == text,
        }
# Create the module-level tokenizer instance shared by all Gradio callbacks.
# Its state (merge table / vocab) persists for the lifetime of the app process.
tokenizer = TokenizerModel()
def train_tokenizer(text):
    """Gradio callback: delegate training to the shared tokenizer instance."""
    status = tokenizer.train(text)
    return status
def process_text(text):
    """Gradio callback: tokenize `text` with the shared tokenizer instance."""
    result = tokenizer.tokenize(text)
    return result
# Build the Gradio interface. The Train tab was removed from the hosted app;
# the tokenizer loads its pre-trained vocabulary from data/ on first use.
with gr.Blocks(title="Sanskrit BPE Tokenizer") as demo:
    gr.Markdown("# Sanskrit BPE Tokenizer")
    gr.Markdown("This tokenizer implements Byte-Pair Encoding (BPE) for Sanskrit text.")

    with gr.Tab("Tokenize"):
        text_input = gr.Textbox(
            label="Input Text",
            placeholder="Enter Sanskrit text to tokenize...",
            lines=3
        )
        tokenize_button = gr.Button("Tokenize")
        result_output = gr.JSON(label="Results")
        # Wire the button to the tokenize callback; result rendered as JSON.
        tokenize_button.click(
            process_text,
            inputs=text_input,
            outputs=result_output
        )

    # Example text kept at column 0 so the markdown string content is unchanged.
    gr.Markdown("""
### Example texts:
```
चीराण्यपास्याज्जनकस्य कन्या नेयं प्रतिज्ञा मम दत्तपूर्वा।
यथासुखं गच्छतु राजपुत्री वनं समग्रा सह सर्वरत्नैः॥
```
""")

if __name__ == "__main__":
    demo.launch()