""" Tokenization Analysis ===================== Core analysis functions for evaluating tokenizers """ import time from typing import Tuple from config import TokenizerInfo, TokenizationMetrics from utils import count_arabic_chars, get_arabic_words, has_diacritics, is_arabic_char from tokenizer_manager import tokenizer_manager def analyze_tokenization( text: str, model_id: str, tokenizer_info: TokenizerInfo ) -> TokenizationMetrics: """Perform comprehensive tokenization analysis""" tokenizer = tokenizer_manager.get_tokenizer(model_id) # Time the tokenization start_time = time.perf_counter() tokens = tokenizer.tokenize(text) token_ids = tokenizer.encode(text, add_special_tokens=False) tokenization_time = (time.perf_counter() - start_time) * 1000 decoded = tokenizer.decode(token_ids, skip_special_tokens=True) # Basic counts words = text.split() total_words = len(words) total_tokens = len(tokens) total_characters = len(text) total_bytes = len(text.encode('utf-8')) # Efficiency metrics fertility = total_tokens / max(total_words, 1) compression_ratio = total_bytes / max(total_tokens, 1) char_per_token = total_characters / max(total_tokens, 1) # OOV analysis unk_token = tokenizer.unk_token if hasattr(tokenizer, 'unk_token') else '[UNK]' oov_count = sum(1 for t in tokens if t == unk_token or '[UNK]' in str(t)) oov_percentage = (oov_count / max(total_tokens, 1)) * 100 # Single Token Retention Rate (STRR) single_token_words = 0 subwords_per_word = [] for word in words: word_tokens = tokenizer.tokenize(word) subwords_per_word.append(len(word_tokens)) if len(word_tokens) == 1: single_token_words += 1 strr = single_token_words / max(total_words, 1) avg_subwords = sum(subwords_per_word) / max(len(subwords_per_word), 1) max_subwords = max(subwords_per_word) if subwords_per_word else 0 continued_ratio = (total_words - single_token_words) / max(total_words, 1) # Arabic-specific metrics arabic_char_count = count_arabic_chars(text) arabic_words = get_arabic_words(text) arabic_tokens_count = 0 for token in tokens: if any(is_arabic_char(c) for c in str(token)): arabic_tokens_count += 1 arabic_fertility = arabic_tokens_count / max(len(arabic_words), 1) if arabic_words else 0 diacritic_preserved = has_diacritics(text) == has_diacritics(decoded) return TokenizationMetrics( total_tokens=total_tokens, total_words=total_words, total_characters=total_characters, total_bytes=total_bytes, fertility=fertility, compression_ratio=compression_ratio, char_per_token=char_per_token, oov_count=oov_count, oov_percentage=oov_percentage, single_token_words=single_token_words, single_token_retention_rate=strr, avg_subwords_per_word=avg_subwords, max_subwords_per_word=max_subwords, continued_words_ratio=continued_ratio, arabic_char_count=arabic_char_count, arabic_token_count=arabic_tokens_count, arabic_fertility=arabic_fertility, diacritic_preservation=diacritic_preserved, tokenization_time_ms=tokenization_time, tokens=tokens, token_ids=token_ids, decoded_text=decoded ) def analyze_single_tokenizer(tokenizer_choice: str, text: str) -> Tuple[str, str, str, str]: """Analyze a single tokenizer - returns HTML outputs""" from ui_components import ( generate_tokenizer_info_card, generate_metrics_card, generate_token_visualization, generate_decoded_section ) if not text or not text.strip(): return ( '


def analyze_single_tokenizer(tokenizer_choice: str, text: str) -> Tuple[str, str, str, str]:
    """Analyze a single tokenizer - returns HTML outputs."""
    from ui_components import (
        generate_tokenizer_info_card,
        generate_metrics_card,
        generate_token_visualization,
        generate_decoded_section,
    )

    # NOTE: the wrapper markup (tags and class names) in the fallback returns below is
    # minimal placeholder HTML; adjust it to the stylesheet used by the UI layer.
    if not text or not text.strip():
        return (
            '<div class="warning-box">⚠️ Please enter some text to analyze</div>',
            '', '', ''
        )

    if not tokenizer_choice:
        return (
            '<div class="warning-box">⚠️ Please select a tokenizer</div>',
            '', '', ''
        )

    model_id = tokenizer_manager.get_model_id_from_choice(tokenizer_choice)
    tokenizer_info = tokenizer_manager.get_available_tokenizers().get(model_id)

    if not tokenizer_info:
        return (
            '<div class="error-box"><h3>Error</h3><p>Tokenizer not found</p></div>',
            '', '', ''
        )

    try:
        metrics = analyze_tokenization(text, model_id, tokenizer_info)
        info_html = generate_tokenizer_info_card(tokenizer_info)
        metrics_html = generate_metrics_card(metrics, tokenizer_info)
        tokens_html = generate_token_visualization(metrics.tokens, metrics.token_ids)
        decoded_html = generate_decoded_section(metrics)
        return info_html, metrics_html, tokens_html, decoded_html
    except Exception as e:
        return (
            f'<div class="error-box"><h3>Error</h3><p>{str(e)}</p></div>',
            '', '', ''
        )
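
# Typical call from the UI layer (sketch; the choice string is whatever label the UI
# passes through tokenizer_manager.get_model_id_from_choice, shown here as a placeholder):
#
#   info_html, metrics_html, tokens_html, decoded_html = analyze_single_tokenizer(
#       "<tokenizer choice label>", "نص تجريبي للتحليل"
#   )
#
# All four return values are HTML fragments; on bad input the first slot carries a
# warning or error box and the remaining three are empty strings.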


def compare_tokenizers(tokenizer_choices: list, text: str) -> str:
    """Compare multiple tokenizers - returns HTML table."""
    if not text or not text.strip():
        return '<div class="warning-box">⚠️ Please enter some text to analyze</div>'

    if not tokenizer_choices or len(tokenizer_choices) < 2:
        return '<div class="warning-box">⚠️ Please select at least 2 tokenizers to compare</div>'

    results = []
    for choice in tokenizer_choices:
        model_id = tokenizer_manager.get_model_id_from_choice(choice)
        tokenizer_info = tokenizer_manager.get_available_tokenizers().get(model_id)
        if tokenizer_info:
            try:
                metrics = analyze_tokenization(text, model_id, tokenizer_info)
                results.append({
                    'name': tokenizer_info.name,
                    'org': tokenizer_info.organization,
                    'type': tokenizer_info.type.value,
                    'metrics': metrics
                })
            except Exception as e:
                results.append({
                    'name': tokenizer_info.name,
                    'org': tokenizer_info.organization,
                    'type': tokenizer_info.type.value,
                    'error': str(e)
                })

    # Sort by fertility (lower is better); failed tokenizers sink to the bottom
    def get_fertility(x):
        if 'error' in x:
            return 999
        return x['metrics'].fertility

    results.sort(key=get_fertility)

    # Generate comparison table
    # NOTE: the table markup (tags and class names) is minimal placeholder HTML;
    # adjust it to the stylesheet used by the UI layer.
    html = '''
    <table class="comparison-table">
        <thead>
            <tr>
                <th>Rank</th>
                <th>Tokenizer</th>
                <th>Type</th>
                <th>Tokens</th>
                <th>Fertility ↓</th>
                <th>Compression ↑</th>
                <th>STRR ↑</th>
                <th>OOV %</th>
            </tr>
        </thead>
        <tbody>
    '''

    for i, result in enumerate(results):
        rank = i + 1
        rank_class = 'rank-1' if rank == 1 else 'rank-2' if rank == 2 else 'rank-3' if rank == 3 else ''

        if 'error' in result:
            html += f'''
            <tr>
                <td class="{rank_class}">#{rank}</td>
                <td>{result['name']}<br><small>{result['org']}</small></td>
                <td>{result['type']}</td>
                <td colspan="5">Error: {result['error']}</td>
            </tr>
            '''
        else:
            m = result['metrics']
            fertility_class = 'excellent' if m.fertility < 1.5 else 'good' if m.fertility < 2.5 else 'poor'
            html += f'''
            <tr>
                <td class="{rank_class}">#{rank}</td>
                <td>{result['name']}<br><small>{result['org']}</small></td>
                <td>{result['type']}</td>
                <td>{m.total_tokens}</td>
                <td class="{fertility_class}">{m.fertility:.3f}</td>
                <td>{m.compression_ratio:.2f}</td>
                <td>{m.single_token_retention_rate:.1%}</td>
                <td>{m.oov_percentage:.1f}%</td>
            </tr>
            '''

    html += '''
        </tbody>
    </table>
    '''

    return html
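

# A minimal smoke-test sketch, not part of the app flow. It assumes the sibling modules
# (config, utils, tokenizer_manager) are importable and that at least one tokenizer is
# already registered with tokenizer_manager; the sample text is arbitrary. Handy for
# checking the metrics pipeline without launching the UI.
if __name__ == "__main__":
    sample_text = "اللغة العربية لغة غنية بالصرف والاشتقاق."
    available = tokenizer_manager.get_available_tokenizers()
    if available:
        model_id, info = next(iter(available.items()))
        m = analyze_tokenization(sample_text, model_id, info)
        print(f"{info.name}: {m.total_tokens} tokens, "
              f"fertility={m.fertility:.3f}, compression={m.compression_ratio:.2f}")
    else:
        print("No tokenizers registered with tokenizer_manager.")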