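# NOTE(review): the next three lines appear to be page residue captured with
# the file (author, commit message, commit id) rather than program text.
# peeyushsinghal's picture
# included hindi tokenizer files
# 10f8413 verified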
import gradio as gr
from tokenizer import Tokenizer, load_config
import json
import html
# Load the tokenizer
# This runs once at import time; the functions below read the module-level
# `tokenizer` instead of loading it per request.
config = load_config("config_app.yml")
# The loaded config supplies the path of the saved tokenizer file.
tokenizer = Tokenizer.load(config["tokenizer_file_path"])
# Attach the app config to the loaded tokenizer instance.
# NOTE(review): presumably Tokenizer methods read `self.config` -- confirm
# against the tokenizer module.
tokenizer.config = config
def highlight_tokens(text: str, encoded_tokens: list) -> str:
    """
    Create HTML with highlighted tokens in the text.

    Each token ID is decoded on its own with the module-level ``tokenizer``
    and searched for in ``text`` starting where the previous match ended.
    A matched token is wrapped in a coloured ``<span>`` with a hover tooltip
    showing its ID. Text lying between matches, and any text after the last
    match, is emitted HTML-escaped but not highlighted. A token whose decoded
    form is not found at or after the current position is skipped.

    Args:
        text (str): The original input text to be tokenized.
        encoded_tokens (list): A list of encoded token IDs.

    Returns:
        str: HTML string with highlighted tokens and tooltips showing token IDs.
    """
    current_pos = 0
    # Fragments are collected and joined once at the end rather than growing
    # a string with repeated `+=`.
    fragments = []

    for i, token in enumerate(encoded_tokens):
        # Decode a single-ID list to get the text of just this token.
        token_text = tokenizer.decode([token])

        # Locate the token in the original text, never looking backwards.
        token_pos = text.find(token_text, current_pos)
        if token_pos == -1:
            # No match from here on: leave the cursor where it is so the
            # underlying text is still emitted later as plain escaped text.
            continue

        # Emit any text skipped over between the previous match and this one.
        if token_pos > current_pos:
            fragments.append(html.escape(text[current_pos:token_pos]))

        # Step the hue by 60 degrees per token, so colours repeat every six
        # tokens.
        color = f"hsl({(i * 60) % 360}, 80%, 85%)"
        fragments.append(f'''
<span
style="background-color: {color};
border-radius: 3px;
padding: 0 3px;
margin: 0 1px;
position: relative;
cursor: help;"
onmouseover="this.querySelector('.tooltip').style.display='block'"
onmouseout="this.querySelector('.tooltip').style.display='none'">
{html.escape(token_text)}
<span class="tooltip"
style="display: none;
position: absolute;
bottom: 100%;
left: 50%;
transform: translateX(-50%);
background-color: #333;
color: white;
padding: 4px 8px;
border-radius: 4px;
font-size: 12px;
white-space: nowrap;
z-index: 1000;">
Token ID: {token}
</span>
</span>''')
        current_pos = token_pos + len(token_text)

    # Add any remaining text after the last matched token.
    if current_pos < len(text):
        fragments.append(html.escape(text[current_pos:]))

    return "".join(fragments)
def process_text(text: str) -> tuple:
    """
    Process input text through the tokenizer and return results.

    Args:
        text (str): The input text to be processed.

    Returns:
        tuple: A tuple containing:
            - HTML component with the highlighted tokens.
            - HTML component with the token statistics.
            - String of token IDs.
        If anything raises, the tuple is instead a red error message
        component followed by two empty strings.
    """
    try:
        # Encode the text
        encoded = tokenizer.encode(text)

        # Create token visualization
        highlighted_text = highlight_tokens(text, encoded)

        token_count = len(encoded)
        byte_count = len(text.encode('utf-8'))

        # UTF-8 bytes divided by (tokens * 4) -- presumably assuming 4 bytes
        # of storage per token ID. With no tokens (e.g. empty input) the
        # ratio is undefined, so report "N/A" instead of dividing by zero.
        if token_count:
            compression = f"{byte_count / (token_count * 4):.2f}x"
        else:
            compression = "N/A"

        # Token statistics
        stats = {
            "Total Tokens": token_count,
            "Unique Tokens": len(set(encoded)),
            "Characters": len(text),
            "Bytes": byte_count,
            "Compression Ratio": compression,
        }

        # Format statistics: one row per entry inside a wrapper div.
        rows = "".join(
            f"<div style='margin: 5px 0;'><b>{key}:</b> {value}</div>"
            for key, value in stats.items()
        )
        stats_html = f"<div style='margin-top: 20px;'>{rows}</div>"

        return (
            gr.HTML(highlighted_text),
            gr.HTML(stats_html),
            f"Token IDs: {encoded}"
        )
    except Exception as e:
        # UI boundary: show the failure in the page instead of crashing the
        # request. The message is escaped because it may echo user input and
        # is inserted into HTML.
        return (
            gr.HTML(f"<span style='color: red'>Error: {html.escape(str(e))}</span>"),
            "",
            ""
        )
# Define example inputs
# Each inner list is one example row holding a single value, matching the
# single "Input Text" textbox of the interface below. The list is passed to
# gr.Interface through its `examples` argument.
examples = [
["यहां वर्तमान में 20 हजार पुस्तकें थी जो अभी रैन बसेरा परिसर के कक्ष में रखी हुई है।"],
["भारत एक विशाल देश है।"],
["मैं हिंदी में बात कर रहा हूं।"],
["नमस्ते, आप कैसे हैं?"],
["दिल्ली भारत की राजधानी है।"]
]
# Custom CSS
# Stylesheet passed to gr.Interface via `css=`. The .token-viz, .stats and
# .token-ids selectors match the `elem_classes` given to the three output
# components; .tooltip matches the inner span emitted by highlight_tokens.
# NOTE(review): .container is not referenced anywhere in this file --
# confirm whether it is still needed.
custom_css = """
.container {
max-width: 800px;
margin: auto;
padding: 20px;
}
.token-viz {
font-family: monospace;
line-height: 1.6;
padding: 15px;
border: 1px solid #ddd;
border-radius: 5px;
background: white;
margin: 10px 0;
position: relative;
}
.stats {
background: #f7f7f7;
padding: 15px;
border-radius: 5px;
margin: 10px 0;
}
.token-ids {
font-family: monospace;
padding: 15px;
background: #f0f0f0;
border-radius: 5px;
overflow-wrap: break-word;
}
.tooltip {
pointer-events: none;
box-shadow: 0 2px 4px rgba(0,0,0,0.2);
}
"""
# Create the Gradio interface
# Wires process_text to one text input and three outputs. The outputs are
# listed in the same order as the tuple process_text returns: highlighted
# tokens, statistics, token IDs.
iface = gr.Interface(
fn=process_text,
inputs=[
gr.Textbox(
label="Input Text",
placeholder="Enter Hindi text here...",
lines=3
)
],
outputs=[
# The elem_classes values correspond to selectors defined in custom_css.
gr.HTML(label="Tokenized Text", elem_classes="token-viz"),
gr.HTML(label="Statistics", elem_classes="stats"),
gr.Textbox(label="Token IDs", elem_classes="token-ids")
],
title="Hindi BPE Tokenizer Visualization",
description="""
This demo shows how the Hindi BPE tokenizer processes text. Each token is highlighted with a different color.
Hover over the highlighted tokens to see their token IDs.
""",
# Example rows defined above in `examples`.
examples=examples,
theme=gr.themes.Soft(),
css=custom_css,
# NOTE(review): newer Gradio releases may expose this option under a
# different name -- check against the pinned Gradio version.
allow_flagging="never"
)
# Start the app only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    # share=True asks Gradio for a publicly shareable link in addition to the
    # local server (see the Gradio launch() documentation).
    iface.launch(share=True)