import gradio as gr
from tokenizer import Tokenizer, load_config
import json
import html

# Load the tokenizer
config = load_config("config_app.yml")
tokenizer = Tokenizer.load(config["tokenizer_file_path"])
tokenizer.config = config


def highlight_tokens(text: str, encoded_tokens: list) -> str:
    """
    Create HTML with highlighted tokens in the text.

    Args:
        text (str): The original input text to be tokenized.
        encoded_tokens (list): A list of encoded token IDs.

    Returns:
        str: HTML string with highlighted tokens and tooltips showing token IDs.
    """
    decoded_tokens = []
    current_pos = 0
    html_text = ""

    # Decode each token and create spans with different colors
    for i, token in enumerate(encoded_tokens):
        token_text = tokenizer.decode([token])
        decoded_tokens.append(token_text)

        # Find where this token's text occurs in the original input
        token_pos = text.find(token_text, current_pos)
        if token_pos != -1:
            # Add any skipped text
            if token_pos > current_pos:
                html_text += html.escape(text[current_pos:token_pos])

            # Highlight the token; the title attribute supplies the hover tooltip
            color = f"hsl({(i * 60) % 360}, 80%, 85%)"
            html_text += (
                f'<span style="background-color: {color}; '
                f'border-radius: 3px; padding: 0 2px;" '
                f'title="Token ID: {token}">'
                f'{html.escape(token_text)}</span>'
            )
            current_pos = token_pos + len(token_text)

    # Add any remaining text
    if current_pos < len(text):
        html_text += html.escape(text[current_pos:])

    return html_text


def process_text(text: str) -> tuple:
    """
    Process input text through the tokenizer and return results.

    Args:
        text (str): The input text to be processed.

    Returns:
        tuple: A tuple containing:
            - HTML string of highlighted tokens.
            - HTML string of token statistics.
            - String of token IDs.
    """
    try:
        # Encode the text
        encoded = tokenizer.encode(text)
        # Decode back to text (round-trip sanity check)
        decoded = tokenizer.decode(encoded)

        # Create token visualization
        highlighted_text = highlight_tokens(text, encoded)

        # Token statistics (compression assumes 4 bytes of storage per token ID)
        stats = {
            "Total Tokens": len(encoded),
            "Unique Tokens": len(set(encoded)),
            "Characters": len(text),
            "Bytes": len(text.encode('utf-8')),
            "Compression Ratio": (
                f"{len(text.encode('utf-8')) / (len(encoded) * 4):.2f}x"
                if encoded else "n/a"  # avoid division by zero on empty input
            ),
        }

        # Format statistics
        stats_html = "<div class='stats'>"
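        # Each statistic below renders as its own line inside the stats panel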
" for key, value in stats.items(): stats_html += f"
{key}: {value}
" stats_html += "
" return ( gr.HTML(highlighted_text), gr.HTML(stats_html), f"Token IDs: {encoded}" ) except Exception as e: return ( gr.HTML(f"Error: {str(e)}"), "", "" ) # Define example inputs examples = [ ["यहां वर्तमान में 20 हजार पुस्तकें थी जो अभी रैन बसेरा परिसर के कक्ष में रखी हुई है।"], ["भारत एक विशाल देश है।"], ["मैं हिंदी में बात कर रहा हूं।"], ["नमस्ते, आप कैसे हैं?"], ["दिल्ली भारत की राजधानी है।"] ] # Custom CSS custom_css = """ .container { max-width: 800px; margin: auto; padding: 20px; } .token-viz { font-family: monospace; line-height: 1.6; padding: 15px; border: 1px solid #ddd; border-radius: 5px; background: white; margin: 10px 0; position: relative; } .stats { background: #f7f7f7; padding: 15px; border-radius: 5px; margin: 10px 0; } .token-ids { font-family: monospace; padding: 15px; background: #f0f0f0; border-radius: 5px; overflow-wrap: break-word; } .tooltip { pointer-events: none; box-shadow: 0 2px 4px rgba(0,0,0,0.2); } """ # Create the Gradio interface iface = gr.Interface( fn=process_text, inputs=[ gr.Textbox( label="Input Text", placeholder="Enter Hindi text here...", lines=3 ) ], outputs=[ gr.HTML(label="Tokenized Text", elem_classes="token-viz"), gr.HTML(label="Statistics", elem_classes="stats"), gr.Textbox(label="Token IDs", elem_classes="token-ids") ], title="Hindi BPE Tokenizer Visualization", description=""" This demo shows how the Hindi BPE tokenizer processes text. Each token is highlighted with a different color. Hover over the highlighted tokens to see their token IDs. """, examples=examples, theme=gr.themes.Soft(), css=custom_css, allow_flagging="never" ) if __name__ == "__main__": iface.launch(share=True)