import gradio as gr
from tokenizer import Tokenizer, load_config
import json
import html

# Load the tokenizer
config = load_config("config_app.yml")
tokenizer = Tokenizer.load(config["tokenizer_file_path"])
tokenizer.config = config


def highlight_tokens(text: str, encoded_tokens: list) -> str:
    """
    Create HTML with highlighted tokens in the text.

    Args:
        text (str): The original input text to be tokenized.
        encoded_tokens (list): A list of encoded token IDs.

    Returns:
        str: HTML string with highlighted tokens and tooltips showing token IDs.
    """
    decoded_tokens = []
    current_pos = 0
    html_text = ""

    # Decode each token and create spans with different colors
    for i, token in enumerate(encoded_tokens):
        token_bytes = tokenizer.decode([token])
        decoded_tokens.append(token_bytes)

        # Find the token in the original text
        token_pos = text.find(token_bytes, current_pos)
        if token_pos != -1:
            # Add any skipped text
            if token_pos > current_pos:
                html_text += html.escape(text[current_pos:token_pos])

            # Add the highlighted token with a tooltip showing its token ID
            color = f"hsl({(i * 60) % 360}, 80%, 85%)"
            html_text += (
                f'<span style="background-color: {color};" '
                f'title="Token ID: {token}">'
                f'{html.escape(token_bytes)}</span>'
            )
            current_pos = token_pos + len(token_bytes)

    # Add any remaining text
    if current_pos < len(text):
        html_text += html.escape(text[current_pos:])

    return html_text


def process_text(text: str) -> tuple:
    """
    Process input text through the tokenizer and return results.

    Args:
        text (str): The input text to be processed.

    Returns:
        tuple: A tuple containing:
            - HTML string of highlighted tokens.
            - HTML string of token statistics.
            - String of token IDs.
    """
    try:
        # Encode the text
        encoded = tokenizer.encode(text)

        # Decode back to text (round-trip sanity check)
        decoded = tokenizer.decode(encoded)

        # Create token visualization
        highlighted_text = highlight_tokens(text, encoded)

        # Token statistics; the compression ratio assumes ~4 bytes per token
        stats = {
            "Total Tokens": len(encoded),
            "Unique Tokens": len(set(encoded)),
            "Characters": len(text),
            "Bytes": len(text.encode('utf-8')),
            "Compression Ratio": (
                f"{len(text.encode('utf-8')) / (len(encoded) * 4):.2f}x"
                if encoded else "N/A"
            ),
        }

        # Format statistics as an HTML list
        stats_html = "<ul>"
        for name, value in stats.items():
            stats_html += f"<li><b>{name}:</b> {value}</li>"
        stats_html += "</ul>"

        # Token IDs as a JSON string
        token_ids = json.dumps(encoded)

        return highlighted_text, stats_html, token_ids
    except Exception as e:
        error_msg = f"Error: {html.escape(str(e))}"
        return error_msg, "", ""
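
# --- Gradio interface ---
# Minimal sketch of the app wiring, assuming a standard Gradio Blocks layout.
# Component labels and arrangement below are illustrative assumptions, not the
# original app's UI; only process_text and its three outputs come from above.
with gr.Blocks(title="Tokenizer Playground") as demo:
    gr.Markdown("# Tokenizer Playground")
    text_input = gr.Textbox(
        label="Input text", lines=4, placeholder="Type some text to tokenize..."
    )
    tokenize_btn = gr.Button("Tokenize")
    highlighted_output = gr.HTML(label="Highlighted tokens")
    stats_output = gr.HTML(label="Token statistics")
    ids_output = gr.Textbox(label="Token IDs")

    # Wire the button to process_text; outputs match its returned tuple
    tokenize_btn.click(
        fn=process_text,
        inputs=text_input,
        outputs=[highlighted_output, stats_output, ids_output],
    )

if __name__ == "__main__":
    demo.launch()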