""" Configuration for Arabic Tokenizer Arena ========================================= Tokenizer registry, dataset configs, and sample texts """ from dataclasses import dataclass, field from typing import List, Dict from enum import Enum class TokenizerType(Enum): ARABIC_SPECIFIC = "Arabic-Specific" MULTILINGUAL_LLM = "Multilingual LLM" ARABIC_LLM = "Arabic LLM" ENCODER_ONLY = "Encoder-Only (BERT)" DECODER_ONLY = "Decoder-Only (GPT)" class TokenizerAlgorithm(Enum): BPE = "Byte-Pair Encoding (BPE)" BBPE = "Byte-Level BPE" WORDPIECE = "WordPiece" SENTENCEPIECE = "SentencePiece" UNIGRAM = "Unigram" TIKTOKEN = "Tiktoken" @dataclass class TokenizerInfo: """Metadata about a tokenizer""" name: str model_id: str type: TokenizerType algorithm: TokenizerAlgorithm vocab_size: int description: str organization: str arabic_support: str # Native, Adapted, Limited dialect_support: List[str] = field(default_factory=list) special_features: List[str] = field(default_factory=list) @dataclass class TokenizationMetrics: """Comprehensive tokenization evaluation metrics""" total_tokens: int total_words: int total_characters: int total_bytes: int fertility: float compression_ratio: float char_per_token: float oov_count: int oov_percentage: float single_token_words: int single_token_retention_rate: float avg_subwords_per_word: float max_subwords_per_word: int continued_words_ratio: float arabic_char_count: int arabic_token_count: int arabic_fertility: float diacritic_preservation: bool tokenization_time_ms: float tokens: List[str] = field(default_factory=list) token_ids: List[int] = field(default_factory=list) decoded_text: str = "" # ============================================================================ # TOKENIZER REGISTRY # ============================================================================ TOKENIZER_REGISTRY: Dict[str, TokenizerInfo] = { # ========== ARABIC-SPECIFIC BERT MODELS ========== "aubmindlab/bert-base-arabertv2": TokenizerInfo( name="AraBERT v2", model_id="aubmindlab/bert-base-arabertv2", type=TokenizerType.ENCODER_ONLY, algorithm=TokenizerAlgorithm.WORDPIECE, vocab_size=64000, description="Arabic BERT with Farasa segmentation, optimized for MSA", organization="AUB MIND Lab", arabic_support="Native", dialect_support=["MSA"], special_features=["Farasa preprocessing", "Morphological segmentation"] ), "aubmindlab/bert-large-arabertv2": TokenizerInfo( name="AraBERT v2 Large", model_id="aubmindlab/bert-large-arabertv2", type=TokenizerType.ENCODER_ONLY, algorithm=TokenizerAlgorithm.WORDPIECE, vocab_size=64000, description="Large Arabic BERT with enhanced capacity", organization="AUB MIND Lab", arabic_support="Native", dialect_support=["MSA"], special_features=["Large model", "Farasa preprocessing"] ), "CAMeL-Lab/bert-base-arabic-camelbert-mix": TokenizerInfo( name="CAMeLBERT Mix", model_id="CAMeL-Lab/bert-base-arabic-camelbert-mix", type=TokenizerType.ENCODER_ONLY, algorithm=TokenizerAlgorithm.WORDPIECE, vocab_size=30000, description="Pre-trained on MSA, DA, and Classical Arabic mix", organization="CAMeL Lab NYU Abu Dhabi", arabic_support="Native", dialect_support=["MSA", "DA", "CA"], special_features=["Multi-variant Arabic", "Classical Arabic support"] ), "CAMeL-Lab/bert-base-arabic-camelbert-msa": TokenizerInfo( name="CAMeLBERT MSA", model_id="CAMeL-Lab/bert-base-arabic-camelbert-msa", type=TokenizerType.ENCODER_ONLY, algorithm=TokenizerAlgorithm.WORDPIECE, vocab_size=30000, description="Specialized for Modern Standard Arabic", organization="CAMeL Lab NYU Abu Dhabi", arabic_support="Native", dialect_support=["MSA"], special_features=["MSA optimized"] ), "CAMeL-Lab/bert-base-arabic-camelbert-da": TokenizerInfo( name="CAMeLBERT DA", model_id="CAMeL-Lab/bert-base-arabic-camelbert-da", type=TokenizerType.ENCODER_ONLY, algorithm=TokenizerAlgorithm.WORDPIECE, vocab_size=30000, description="Specialized for Dialectal Arabic", organization="CAMeL Lab NYU Abu Dhabi", arabic_support="Native", dialect_support=["Egyptian", "Gulf", "Levantine", "Maghrebi"], special_features=["Dialect optimized"] ), "CAMeL-Lab/bert-base-arabic-camelbert-ca": TokenizerInfo( name="CAMeLBERT CA", model_id="CAMeL-Lab/bert-base-arabic-camelbert-ca", type=TokenizerType.ENCODER_ONLY, algorithm=TokenizerAlgorithm.WORDPIECE, vocab_size=30000, description="Specialized for Classical Arabic", organization="CAMeL Lab NYU Abu Dhabi", arabic_support="Native", dialect_support=["Classical"], special_features=["Classical Arabic", "Religious texts"] ), "UBC-NLP/MARBERT": TokenizerInfo( name="MARBERT", model_id="UBC-NLP/MARBERT", type=TokenizerType.ENCODER_ONLY, algorithm=TokenizerAlgorithm.WORDPIECE, vocab_size=100000, description="Multi-dialectal Arabic BERT trained on Twitter data", organization="UBC NLP", arabic_support="Native", dialect_support=["MSA", "Egyptian", "Gulf", "Levantine", "Maghrebi"], special_features=["Twitter data", "100K vocabulary", "Multi-dialect"] ), "UBC-NLP/ARBERT": TokenizerInfo( name="ARBERT", model_id="UBC-NLP/ARBERT", type=TokenizerType.ENCODER_ONLY, algorithm=TokenizerAlgorithm.WORDPIECE, vocab_size=100000, description="Arabic BERT focused on MSA with large vocabulary", organization="UBC NLP", arabic_support="Native", dialect_support=["MSA"], special_features=["100K vocabulary", "MSA focused"] ), "asafaya/bert-base-arabic": TokenizerInfo( name="Arabic BERT (Safaya)", model_id="asafaya/bert-base-arabic", type=TokenizerType.ENCODER_ONLY, algorithm=TokenizerAlgorithm.WORDPIECE, vocab_size=32000, description="Arabic BERT trained on MSA and dialectal Arabic", organization="Safaya", arabic_support="Native", dialect_support=["MSA", "DA"], special_features=["TPU trained", "Dialect support"] ), # ========== ARABIC-SPECIFIC TOKENIZERS ========== "riotu-lab/Aranizer-PBE-86k": TokenizerInfo( name="Aranizer PBE 86K", model_id="riotu-lab/Aranizer-PBE-86k", type=TokenizerType.ARABIC_SPECIFIC, algorithm=TokenizerAlgorithm.BPE, vocab_size=86000, description="Pair Byte Encoding tokenizer optimized for Arabic LLMs", organization="RIOTU Lab", arabic_support="Native", dialect_support=["MSA"], special_features=["Low fertility", "LLM optimized", "86K vocab"] ), "riotu-lab/Aranizer-SP-86k": TokenizerInfo( name="Aranizer SP 86K", model_id="riotu-lab/Aranizer-SP-86k", type=TokenizerType.ARABIC_SPECIFIC, algorithm=TokenizerAlgorithm.SENTENCEPIECE, vocab_size=86000, description="SentencePiece tokenizer optimized for Arabic", organization="RIOTU Lab", arabic_support="Native", dialect_support=["MSA"], special_features=["Low fertility", "SentencePiece", "86K vocab"] ), "riotu-lab/Aranizer-PBE-32k": TokenizerInfo( name="Aranizer PBE 32K", model_id="riotu-lab/Aranizer-PBE-32k", type=TokenizerType.ARABIC_SPECIFIC, algorithm=TokenizerAlgorithm.BPE, vocab_size=32000, description="Compact PBE tokenizer for Arabic", organization="RIOTU Lab", arabic_support="Native", dialect_support=["MSA"], special_features=["Compact", "LLM compatible"] ), "riotu-lab/Aranizer-SP-32k": TokenizerInfo( name="Aranizer SP 32K", model_id="riotu-lab/Aranizer-SP-32k", type=TokenizerType.ARABIC_SPECIFIC, algorithm=TokenizerAlgorithm.SENTENCEPIECE, vocab_size=32000, description="Compact SentencePiece tokenizer for Arabic", organization="RIOTU Lab", arabic_support="Native", dialect_support=["MSA"], special_features=["Compact", "Efficient"] ), # ========== ARABIC LLMs ========== "inception-mbzuai/jais-13b": TokenizerInfo( name="Jais 13B", model_id="inception-mbzuai/jais-13b", type=TokenizerType.ARABIC_LLM, algorithm=TokenizerAlgorithm.SENTENCEPIECE, vocab_size=84992, description="World's most advanced Arabic LLM, trained from scratch", organization="Inception/MBZUAI", arabic_support="Native", dialect_support=["MSA", "Gulf", "Egyptian", "Levantine"], special_features=["Arabic-first", "Lowest fertility", "UAE-native"] ), "inceptionai/jais-family-30b-8k-chat": TokenizerInfo( name="Jais 30B Chat", model_id="inceptionai/jais-family-30b-8k-chat", type=TokenizerType.ARABIC_LLM, algorithm=TokenizerAlgorithm.SENTENCEPIECE, vocab_size=84992, description="Enhanced 30B version with chat capabilities", organization="Inception AI", arabic_support="Native", dialect_support=["MSA", "Gulf", "Egyptian", "Levantine"], special_features=["30B parameters", "Chat optimized", "8K context"] ), "FreedomIntelligence/AceGPT-13B-chat": TokenizerInfo( name="AceGPT 13B Chat", model_id="FreedomIntelligence/AceGPT-13B-chat", type=TokenizerType.ARABIC_LLM, algorithm=TokenizerAlgorithm.SENTENCEPIECE, vocab_size=32000, description="Arabic-enhanced LLaMA with cultural alignment and chat", organization="Freedom Intelligence", arabic_support="Adapted", dialect_support=["MSA"], special_features=["LLaMA-based", "Cultural alignment", "RLHF", "Chat"] ), "silma-ai/SILMA-9B-Instruct-v1.0": TokenizerInfo( name="SILMA 9B Instruct", model_id="silma-ai/SILMA-9B-Instruct-v1.0", type=TokenizerType.ARABIC_LLM, algorithm=TokenizerAlgorithm.SENTENCEPIECE, vocab_size=256000, description="Top-ranked Arabic LLM based on Gemma, outperforms larger models", organization="SILMA AI", arabic_support="Native", dialect_support=["MSA", "Gulf", "Egyptian", "Levantine"], special_features=["Gemma-based", "SOTA 9B class", "Efficient"] ), "silma-ai/SILMA-Kashif-2B-Instruct-v1.0": TokenizerInfo( name="SILMA Kashif 2B (RAG)", model_id="silma-ai/SILMA-Kashif-2B-Instruct-v1.0", type=TokenizerType.ARABIC_LLM, algorithm=TokenizerAlgorithm.SENTENCEPIECE, vocab_size=256000, description="RAG-optimized Arabic model, excellent for context-based QA", organization="SILMA AI", arabic_support="Native", dialect_support=["MSA"], special_features=["RAG optimized", "12K context", "Compact"] ), "QCRI/Fanar-1-9B-Instruct": TokenizerInfo( name="Fanar 9B Instruct", model_id="QCRI/Fanar-1-9B-Instruct", type=TokenizerType.ARABIC_LLM, algorithm=TokenizerAlgorithm.SENTENCEPIECE, vocab_size=256000, description="Qatar's Arabic LLM aligned with Islamic values and Arab culture", organization="QCRI (Qatar)", arabic_support="Native", dialect_support=["MSA", "Gulf", "Egyptian", "Levantine"], special_features=["Islamic RAG", "Cultural alignment", "Gemma-based"] ), "stabilityai/ar-stablelm-2-chat": TokenizerInfo( name="Arabic StableLM 2 Chat", model_id="stabilityai/ar-stablelm-2-chat", type=TokenizerType.ARABIC_LLM, algorithm=TokenizerAlgorithm.BPE, vocab_size=100289, description="Stability AI's Arabic instruction-tuned 1.6B model", organization="Stability AI", arabic_support="Native", dialect_support=["MSA"], special_features=["Compact 1.6B", "Chat optimized", "Efficient"] ), "Navid-AI/Yehia-7B-preview": TokenizerInfo( name="Yehia 7B Preview", model_id="Navid-AI/Yehia-7B-preview", type=TokenizerType.ARABIC_LLM, algorithm=TokenizerAlgorithm.BPE, vocab_size=128256, description="Best Arabic model on AraGen-Leaderboard (0.5B-25B), GRPO trained", organization="Navid AI", arabic_support="Native", dialect_support=["MSA", "Gulf", "Egyptian", "Levantine"], special_features=["GRPO trained", "3C3H aligned", "SOTA AraGen"] ), # ========== DIALECT-SPECIFIC MODELS ========== "MBZUAI-Paris/Atlas-Chat-9B": TokenizerInfo( name="Atlas-Chat 9B (Darija)", model_id="MBZUAI-Paris/Atlas-Chat-9B", type=TokenizerType.ARABIC_LLM, algorithm=TokenizerAlgorithm.SENTENCEPIECE, vocab_size=256000, description="First LLM for Moroccan Arabic (Darija), Gemma-based", organization="MBZUAI Paris", arabic_support="Native", dialect_support=["Darija", "MSA"], special_features=["Moroccan dialect", "Transliteration", "Cultural"] ), "MBZUAI-Paris/Atlas-Chat-2B": TokenizerInfo( name="Atlas-Chat 2B (Darija)", model_id="MBZUAI-Paris/Atlas-Chat-2B", type=TokenizerType.ARABIC_LLM, algorithm=TokenizerAlgorithm.SENTENCEPIECE, vocab_size=256000, description="Compact Moroccan Arabic model for edge deployment", organization="MBZUAI Paris", arabic_support="Native", dialect_support=["Darija", "MSA"], special_features=["Compact", "Moroccan dialect", "Edge-ready"] ), # ========== MULTILINGUAL LLMs ========== "Qwen/Qwen2.5-7B": TokenizerInfo( name="Qwen 2.5 7B", model_id="Qwen/Qwen2.5-7B", type=TokenizerType.MULTILINGUAL_LLM, algorithm=TokenizerAlgorithm.BPE, vocab_size=151936, description="Alibaba's multilingual LLM with 30+ language support", organization="Alibaba Qwen", arabic_support="Supported", dialect_support=["MSA"], special_features=["152K vocab", "128K context", "30+ languages"] ), "google/gemma-2-9b": TokenizerInfo( name="Gemma 2 9B", model_id="google/gemma-2-9b", type=TokenizerType.MULTILINGUAL_LLM, algorithm=TokenizerAlgorithm.SENTENCEPIECE, vocab_size=256000, description="Google's efficient multilingual model", organization="Google", arabic_support="Supported", dialect_support=["MSA"], special_features=["256K vocab", "Efficient architecture"] ), "mistralai/Mistral-7B-v0.3": TokenizerInfo( name="Mistral 7B v0.3", model_id="mistralai/Mistral-7B-v0.3", type=TokenizerType.MULTILINGUAL_LLM, algorithm=TokenizerAlgorithm.SENTENCEPIECE, vocab_size=32768, description="Efficient multilingual model with sliding window attention", organization="Mistral AI", arabic_support="Limited", dialect_support=["MSA"], special_features=["Sliding window", "Efficient"] ), "mistralai/Mistral-Nemo-Base-2407": TokenizerInfo( name="Mistral Nemo", model_id="mistralai/Mistral-Nemo-Base-2407", type=TokenizerType.MULTILINGUAL_LLM, algorithm=TokenizerAlgorithm.TIKTOKEN, vocab_size=131072, description="Uses Tekken tokenizer, optimized for multilingual", organization="Mistral AI + NVIDIA", arabic_support="Supported", dialect_support=["MSA"], special_features=["Tekken tokenizer", "131K vocab", "Multilingual optimized"] ), "xlm-roberta-base": TokenizerInfo( name="XLM-RoBERTa Base", model_id="xlm-roberta-base", type=TokenizerType.MULTILINGUAL_LLM, algorithm=TokenizerAlgorithm.SENTENCEPIECE, vocab_size=250002, description="Cross-lingual model covering 100 languages", organization="Facebook AI", arabic_support="Supported", dialect_support=["MSA"], special_features=["250K vocab", "100 languages"] ), "bert-base-multilingual-cased": TokenizerInfo( name="mBERT", model_id="bert-base-multilingual-cased", type=TokenizerType.MULTILINGUAL_LLM, algorithm=TokenizerAlgorithm.WORDPIECE, vocab_size=119547, description="Original multilingual BERT, baseline for comparison", organization="Google", arabic_support="Limited", dialect_support=["MSA"], special_features=["Baseline model", "104 languages"] ), "tiiuae/falcon-7b": TokenizerInfo( name="Falcon 7B", model_id="tiiuae/falcon-7b", type=TokenizerType.MULTILINGUAL_LLM, algorithm=TokenizerAlgorithm.BPE, vocab_size=65024, description="TII's powerful open-source LLM", organization="Technology Innovation Institute", arabic_support="Limited", dialect_support=["MSA"], special_features=["65K vocab", "RefinedWeb trained"] ), } # ============================================================================ # LEADERBOARD DATASETS - Real HuggingFace Datasets # ============================================================================ LEADERBOARD_DATASETS = { "arabic_mmlu": { "hf_id": "MBZUAI/ArabicMMLU", "name": "ArabicMMLU", "category": "MSA Benchmark", "text_column": "Question", "split": "test", "subset": "All", "samples": 5000, "description": "Multi-task benchmark from Arab school exams" }, "astd": { "hf_id": "arbml/ASTD", "name": "ASTD (Egyptian)", "category": "Egyptian Dialect", "text_column": "tweet", "split": "train", "subset": None, "samples": 5000, "description": "Egyptian Arabic sentiment tweets" }, "athar": { "hf_id": "mohamed-khalil/ATHAR", "name": "ATHAR Classical", "category": "Classical Arabic", "text_column": "arabic", "split": "train", "subset": None, "samples": 5000, "description": "Classical Arabic sentences" }, "arcd": { "hf_id": "arcd", "name": "ARCD", "category": "QA Dataset", "text_column": "context", "split": "train", "subset": None, "samples": 1395, "description": "Arabic Reading Comprehension Dataset" }, "ashaar": { "hf_id": "arbml/Ashaar_dataset", "name": "Ashaar Poetry", "category": "Poetry", "text_column": "poem_text", "split": "train", "subset": None, "samples": 5000, "description": "Arabic poetry verses" }, "hadith": { "hf_id": "gurgutan/sunnah_ar_en_dataset", "name": "Hadith", "category": "Religious", "text_column": "hadith_text_ar", "split": "train", "subset": None, "samples": 5000, "description": "Hadith collection" }, "arabic_sentiment": { "hf_id": "arbml/Arabic_Sentiment_Twitter_Corpus", "name": "Arabic Sentiment", "category": "Social Media", "text_column": "tweet", "split": "train", "subset": None, "samples": 5000, "description": "Arabic Twitter sentiment" }, "sanad": { "hf_id": "arbml/SANAD", "name": "SANAD News", "category": "News", "text_column": "Article", "split": "train", "subset": None, "samples": 5000, "description": "Arabic news articles" }, } # ============================================================================ # SAMPLE TEXTS # ============================================================================ SAMPLE_TEXTS = { "MSA News": "أعلنت وزارة التربية والتعليم عن بدء العام الدراسي الجديد في الأول من سبتمبر، حيث ستعود المدارس لاستقبال الطلاب بعد العطلة الصيفية الطويلة.", "MSA Formal": "إن تطوير تقنيات الذكاء الاصطناعي يمثل نقلة نوعية في مجال معالجة اللغات الطبيعية، وخاصة فيما يتعلق باللغة العربية ذات الخصائص المورفولوجية الغنية.", "Egyptian Dialect": "ازيك يا صاحبي؟ إيه أخبارك؟ عامل إيه النهارده؟ قولي هنروح فين بكره؟", "Gulf Dialect": "شلونك؟ شخبارك؟ وش تسوي الحين؟ ودك تروح وياي للسوق؟", "Levantine Dialect": "كيفك؟ شو أخبارك؟ شو عم تعمل هلق؟ بدك تيجي معي على السوق؟", "Classical Arabic (Quran)": "بِسْمِ اللَّهِ الرَّحْمَٰنِ الرَّحِيمِ ۝ الْحَمْدُ لِلَّهِ رَبِّ الْعَالَمِينَ", "Poetry": "وما من كاتبٍ إلا سيفنى ويُبقي الدهرُ ما كتبت يداهُ", "Technical": "يستخدم نموذج المحولات آلية الانتباه الذاتي لمعالجة تسلسلات النصوص بشكل متوازي.", "Mixed Arabic-English": "The Arabic language العربية is a Semitic language with over 400 million speakers worldwide.", "With Diacritics": "إِنَّ اللَّهَ وَمَلَائِكَتَهُ يُصَلُّونَ عَلَى النَّبِيِّ", }