import re
import unicodedata

from transformers import PreTrainedTokenizerFast

REPLACEMENTS = {
    # whitespace
    "\n": " ",
    "\r": " ",
    "\t": " ",
    "\xa0": " ",
    "\u2009": " ",
    "\u202f": " ",
    "\u200b": "",
    # quotes
    "‘": "'",
    "’": "'",
    "‚": "'",
    "‛": "'",
    "“": '"',
    "”": '"',
    "„": '"',
    "«": '"',
    "»": '"',
    # dashes
    "–": "-",
    "—": "-",
    "−": "-",
    "-": "-",
    # ellipsis
    "…": "...",
    # bullets & symbols
    "•": ".",
    "∙": ".",
    "·": ".",
    # currencies
    "€": " euros",
    "$": " dollars",
    "£": " pounds",
    "¥": " yen",
    # misc
    "°": " degrees",
    "©": "",
    "®": "",
    "™": "",
}


def clean_text(text: str) -> str:
    # NFKC already folds some compatibility characters (e.g. "…" -> "...");
    # the table above handles quotes, dashes, currencies, and the rest.
    text = unicodedata.normalize("NFKC", text)
    for src, tgt in REPLACEMENTS.items():
        text = text.replace(src, tgt)
    text = re.sub(r"\s+", " ", text.strip())  # collapse runs of whitespace
    return text


class BasicTextProcessor:
    """Basic text processor on top of a character-level BPE tokenizer."""

    def __init__(self, tokenizer_file: str):
        self.tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_file)

    def normalize(self, text: str) -> str:
        """Pre-normalization: NFKC, character replacements, whitespace cleanup."""
        return clean_text(text)

    def __call__(self, text: str, **tokenizer_kwargs):
        """Normalize, then tokenize to ids."""
        text = self.normalize(text)
        return self.tokenizer.encode(text, **tokenizer_kwargs)

    def detokenize(self, token_ids):
        """Convert token ids back to a string."""
        out = self.tokenizer.decode(token_ids, skip_special_tokens=False)
        # Assumes the fast tokenizer joins tokens with single spaces when no
        # decoder is configured, so a real space character surfaces as a run of
        # spaces: mark it with a sentinel, drop the joiner spaces, then restore it.
        whitespace = "##[WHITESPACE]"
        return out.replace("  ", whitespace).replace(" ", "").replace(whitespace, " ")
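

if __name__ == "__main__":
    # Usage sketch (illustrative only): "bpe_tokenizer.json" is a hypothetical
    # path to a tokenizers JSON file; point it at your own trained tokenizer.
    processor = BasicTextProcessor("bpe_tokenizer.json")

    sample = "“Smart quotes”, an ellipsis… and a non-breaking\u00a0space."
    ids = processor(sample)
    print(ids)
    print(processor.detokenize(ids))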