""" Arabic Text Utilities ===================== Helper functions for Arabic text analysis """ import re from typing import List def is_arabic_char(char: str) -> bool: """Check if character is Arabic""" if len(char) != 1: return False code = ord(char) return ( (0x0600 <= code <= 0x06FF) or # Arabic (0x0750 <= code <= 0x077F) or # Arabic Supplement (0x08A0 <= code <= 0x08FF) or # Arabic Extended-A (0xFB50 <= code <= 0xFDFF) or # Arabic Presentation Forms-A (0xFE70 <= code <= 0xFEFF) # Arabic Presentation Forms-B ) def count_arabic_chars(text: str) -> int: """Count Arabic characters in text""" return sum(1 for c in text if is_arabic_char(c)) def has_diacritics(text: str) -> bool: """Check if text contains Arabic diacritics (tashkeel)""" diacritics = set('ًٌٍَُِّْٰ') return any(c in diacritics for c in text) def normalize_arabic(text: str) -> str: """Basic Arabic normalization""" # Normalize alef variants text = re.sub('[إأآا]', 'ا', text) # Normalize yeh text = re.sub('ى', 'ي', text) # Normalize teh marbuta text = re.sub('ة', 'ه', text) return text def get_arabic_words(text: str) -> List[str]: """Extract Arabic words from text""" words = text.split() return [w for w in words if any(is_arabic_char(c) for c in w)] def remove_diacritics(text: str) -> str: """Remove Arabic diacritics from text""" diacritics = 'ًٌٍَُِّْٰ' return ''.join(c for c in text if c not in diacritics)