# Hugging Face Space: Dhivehi Tokenizer Comparison
# (The original upload carried a scraped "Spaces: Running" status header here.)
| import gradio as gr | |
| from transformers import AutoTokenizer, T5Tokenizer | |
| import asyncio | |
| import threading | |
| from concurrent.futures import ThreadPoolExecutor | |
| import time | |
# Fixed list of custom tokenizers (left panel).
# Maps human-readable dropdown labels to Hugging Face Hub repo IDs.
# NOTE: insertion order defines the order of choices in the dropdown,
# and "T5 Extended" is the UI's default selection — do not reorder casually.
TOKENIZER_CUSTOM = {
    "T5 Extended": "alakxender/dhivehi-T5-tokenizer-extended",
    "RoBERTa Extended": "alakxender/dhivehi-roberta-tokenizer-extended",
    "Google mT5": "google/mt5-base",
    "Google mT5 Extended": "alakxender/mt5-dhivehi-tokenizer-extended",
    "DeBERTa Extended": "alakxender/deberta-dhivehi-tokenizer-extended",
    "XLM-RoBERTa Extended": "alakxender/xlmr-dhivehi-tokenizer-extended",
    "Bert Extended": "alakxender/bert-dhivehi-tokenizer-extended",
    "Bert Extended Fast": "alakxender/bert-fast-dhivehi-tokenizer-extended"
}
# Suggested stock model paths for the right input (the dropdown also
# accepts any custom Hub path via allow_custom_value=True).
SUGGESTED_STOCK_PATHS = [
    "google/flan-t5-base",
    "t5-small",
    "t5-base",
    "t5-large",
    "google/mt5-base",
    "microsoft/trocr-base-handwritten",
    "microsoft/trocr-base-printed",
    # BUG FIX: a missing trailing comma here previously triggered implicit
    # string concatenation, merging this entry and the next into the bogus
    # path "microsoft/deberta-v3-basexlm-roberta-base" and silently dropping
    # "xlm-roberta-base" from the suggestions.
    "microsoft/deberta-v3-base",
    "xlm-roberta-base",
    "naver-clova-ix/donut-base",
    "bert-base-multilingual-cased",
]
# Module-level memo of already-loaded tokenizers, keyed by Hub path,
# so repeated comparisons don't re-download or re-initialize anything.
tokenizer_cache = {}


def load_tokenizer(tokenizer_path):
    """Load and cache a tokenizer for *tokenizer_path*.

    Tries AutoTokenizer first; if that raises and the path looks like a
    T5/mT5 model, retries with the slow T5Tokenizer. Any other failure is
    re-raised to the caller.
    """
    cached = tokenizer_cache.get(tokenizer_path)
    if cached is not None:
        return cached
    try:
        loaded = AutoTokenizer.from_pretrained(tokenizer_path)
    except Exception:
        lowered = tokenizer_path.lower()
        # Fall back to the slow sentencepiece-based tokenizer only for
        # T5-family paths; everything else propagates the original error.
        if "t5" not in lowered and "mt5" not in lowered:
            raise
        loaded = T5Tokenizer.from_pretrained(tokenizer_path)
    tokenizer_cache[tokenizer_path] = loaded
    return loaded
def tokenize_display(text, tokenizer_path):
    """Tokenize *text* with the tokenizer at *tokenizer_path*.

    Returns a (tokens, ids, decoded) triple. Never raises: on any failure
    it returns an error-marker triple instead, so callers can render the
    problem inline.
    """
    try:
        tok = load_tokenizer(tokenizer_path)
        encoding = tok(text, return_offsets_mapping=False, add_special_tokens=True)
        token_ids = encoding.input_ids
        token_strs = tok.convert_ids_to_tokens(token_ids)
        round_trip = tok.decode(token_ids, skip_special_tokens=False)
    except Exception as err:
        return [f"[ERROR] {str(err)}"], [], "[Tokenizer Error]"
    return token_strs, token_ids, round_trip
def create_token_visualization(tokens, ids):
    """Create a visual representation of tokens with colors and spacing"""
    if not (tokens and ids):
        return "❌ No tokens to display"
    # Cycle through a small emoji palette so adjacent tokens are easy to
    # tell apart in the rendered markdown.
    palette = ("🟦", "🟩", "🟨", "🟪", "🟧", "🟫")
    # Make sentencepiece/special markers readable: '▁' word-start marker,
    # plus the T5-style start/end sentinels.
    cleaned = [
        t.replace('▁', '_').replace('</s>', '[END]').replace('<s>', '[START]')
        for t in tokens
    ]
    return " ".join(
        f"{palette[i % len(palette)]} `{shown}` ({tid})"
        for i, (shown, tid) in enumerate(zip(cleaned, ids))
    )
# Blocking comparison handler with Gradio progress updates.
def compare_side_by_side_with_progress(dv_text, en_text, custom_label, stock_path, progress=gr.Progress()):
    # NOTE: gr.Progress() as a default argument is the Gradio convention for
    # progress injection (Gradio substitutes a live tracker per call), so the
    # usual mutable-default pitfall does not apply here.
    def format_block(title, tokenizer_path):
        # Build one markdown report for a single tokenizer: token counts,
        # colored token blocks, raw ids and the decode round-trip for both
        # the Dhivehi and English inputs.
        dv_tokens, dv_ids, dv_decoded = tokenize_display(dv_text, tokenizer_path)
        en_tokens, en_ids, en_decoded = tokenize_display(en_text, tokenizer_path)
        return f"""\
## 🔤 {title}
### 🈁 Dhivehi: `{dv_text}`
**🎯 Tokens:** {len(dv_tokens) if dv_ids else 'N/A'} tokens
{create_token_visualization(dv_tokens, dv_ids)}
**🔢 Token IDs:** `{dv_ids if dv_ids else '[ERROR]'}`
**🔄 Decoded:** `{dv_decoded}`
---
### 🇬🇧 English: `{en_text}`
**🎯 Tokens:** {len(en_tokens) if en_ids else 'N/A'} tokens
{create_token_visualization(en_tokens, en_ids)}
**🔢 Token IDs:** `{en_ids if en_ids else '[ERROR]'}`
**🔄 Decoded:** `{en_decoded}`
---
"""
    # Resolve the dropdown label to a Hub path; an unknown label is reported
    # in the left pane and the right pane is left empty.
    try:
        custom_path = TOKENIZER_CUSTOM[custom_label]
    except KeyError:
        return "[ERROR] Invalid custom tokenizer selected", ""
    # Show loading progress
    progress(0.1, desc="Loading custom tokenizer...")
    # Load custom tokenizer. tokenize_display already swallows tokenizer
    # errors, so this guard mainly covers failures while building the report.
    try:
        custom_result = format_block("Custom Tokenizer", custom_path)
        progress(0.5, desc="Custom tokenizer loaded. Loading stock tokenizer...")
    except Exception as e:
        custom_result = f"[ERROR] Failed to load custom tokenizer: {str(e)}"
        progress(0.5, desc="Custom tokenizer failed. Loading stock tokenizer...")
    # Load stock tokenizer; errors here don't affect the custom result above.
    try:
        stock_result = format_block("Stock Tokenizer", stock_path)
        progress(1.0, desc="Complete!")
    except Exception as e:
        stock_result = f"[ERROR] Failed to load stock tokenizer: {str(e)}"
        progress(1.0, desc="Complete with errors!")
    return custom_result, stock_result
# Generator-style handler: yields a placeholder first, then the real results,
# so the UI shows immediate feedback while tokenizers download.
# NOTE(review): this function is currently unused — the button below is wired
# directly to compare_side_by_side_with_progress; confirm which is intended.
def compare_tokenizers_async(dv_text, en_text, custom_label, stock_path):
    # Return immediate loading message
    loading_msg = """
## ⏳ Loading Tokenizer...
🚀 **Status:** Downloading and initializing tokenizer...
*This may take a moment for first-time downloads*
"""
    # Use ThreadPoolExecutor for non-blocking execution.
    # NOTE(review): max_workers=2 but only one task is ever submitted; also,
    # the gr.Progress() default of the worker function presumably cannot
    # report progress from this background thread — verify against Gradio docs.
    with ThreadPoolExecutor(max_workers=2) as executor:
        future = executor.submit(compare_side_by_side_with_progress, dv_text, en_text, custom_label, stock_path)
        # Return loading state first
        yield loading_msg, loading_msg
        # Then return actual results
        try:
            custom_result, stock_result = future.result(timeout=120)  # 2 minute timeout
            yield custom_result, stock_result
        except Exception as e:
            error_msg = f"## ❌ Error\n\n**Failed to load tokenizers:** {str(e)}"
            yield error_msg, error_msg
# Gradio UI: two text inputs, two tokenizer pickers, side-by-side markdown output.
with gr.Blocks(title="Dhivehi Tokenizer Comparison Tool", theme=gr.themes.Soft()) as demo:
    gr.Markdown("## 🧠 Dhivehi Tokenizer Comparison")
    gr.Markdown("Compare how different tokenizers process Dhivehi and English input text.")
    # Input texts: Dhivehi (rendered right-to-left) and English, side by side.
    with gr.Row():
        dhivehi_text = gr.Textbox(
            label="Dhivehi Text",
            lines=2,
            value="އީދުގެ ހަރަކާތްތައް ފެށުމަށް މިރޭ ހުޅުމާލޭގައި އީދު މަޅި ރޯކުރަނީ",
            rtl=True,
            placeholder="Enter Dhivehi text here..."
        )
        english_text = gr.Textbox(
            label="English Text",
            lines=2,
            value="The quick brown fox jumps over the lazy dog",
            placeholder="Enter English text here..."
        )
    # Tokenizer selection: fixed custom list (left) vs free-form Hub path (right).
    with gr.Row():
        tokenizer_a = gr.Dropdown(
            label="Select Custom Tokenizer",
            choices=list(TOKENIZER_CUSTOM.keys()),
            value="T5 Extended",
            info="Pre-trained Dhivehi tokenizers (or paste a path)"
        )
        tokenizer_b = gr.Dropdown(
            label="Enter or Select Stock Tokenizer Path",
            choices=SUGGESTED_STOCK_PATHS,
            value="google/flan-t5-base",
            allow_custom_value=True,
            info="Standard HuggingFace tokenizers (or paste a path)"
        )
    compare_button = gr.Button("🔄 Compare Tokenizers", variant="primary", size="lg")
    with gr.Row():
        output_custom = gr.Markdown(label="Custom Tokenizer Output", height=400)
        output_stock = gr.Markdown(label="Stock Tokenizer Output", height=400)
    # Wire the button to the blocking progress-reporting handler.
    # NOTE(review): compare_tokenizers_async (the generator variant) exists
    # above but is not used here — confirm whether that was intentional.
    compare_button.click(
        compare_side_by_side_with_progress,
        inputs=[dhivehi_text, english_text, tokenizer_a, tokenizer_b],
        outputs=[output_custom, output_stock],
        show_progress=True
    )

if __name__ == "__main__":
    demo.launch()