import re
import unicodedata

from transformers import PreTrainedTokenizerFast

REPLACEMENTS = {
    # whitespace
    "\n": " ",
    "\r": " ",
    "\t": " ",
    "\xa0": " ",
    "\u2009": " ",
    "\u202f": " ",
    "\u200b": "",
    # quotes
    "‘": "'",
    "’": "'",
    "‚": "'",
    "‛": "'",
    "“": '"',
    "”": '"',
    "„": '"',
    "«": '"',
    "»": '"',
    # dashes
    "–": "-",
    "—": "-",
    "−": "-",
    "\u2010": "-",  # NOTE: assumption; the original entry mapped "-" to itself (a no-op), U+2010 HYPHEN is the likely intent
    # ellipsis
    "…": "...",
    # bullets & symbols
    "•": ".",
    "∙": ".",
    "·": ".",
    # currencies
    "€": " euros",
    "$": " dollars",
    "£": " pounds",
    "¥": " yen",
    # misc
    "°": " degrees",
    "©": "",
    "®": "",
    "™": "",
}


def clean_text(text: str) -> str:
    """Apply NFKC normalization and the replacement table, then collapse whitespace."""
    text = unicodedata.normalize("NFKC", text)
    for src, tgt in REPLACEMENTS.items():
        text = text.replace(src, tgt)
    text = re.sub(r"\s+", " ", text.strip())  # collapse whitespace runs
    return text
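

# Illustrative example (not executed): curly quotes, dashes, and ellipses fold to ASCII:
#   clean_text("“Hello” — world…")  ->  '"Hello" - world...'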


class BasicTextProcessor:
    """
    Basic text processor on top of a character-level BPE model.
    """

    def __init__(self, tokenizer_file: str):
        self.tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_file)

    def normalize(self, text: str) -> str:
        """Basic pre-normalization: Unicode folding, punctuation replacement, whitespace cleanup."""
        # clean_text() already strips leading/trailing whitespace.
        return clean_text(text)

    def __call__(self, text: str, **tokenizer_kwargs):
        """Normalize, then tokenize."""
        text = self.normalize(text)
        return self.tokenizer.encode(text, **tokenizer_kwargs)

    def detokenize(self, token_ids):
        """Optional: convert back to a string."""
        out = self.tokenizer.decode(token_ids, skip_special_tokens=False)
        # decode() joins character-level tokens with single spaces, so a real
        # space in the text surfaces as a run of three spaces (separator,
        # space token, separator). NOTE: the three-space width is an assumption
        # about this tokenizer's decoder; the original source rendered the
        # first pattern as a single space, which makes the chain a no-op.
        whitespace = "##[WHITESPACE]"
        return out.replace("   ", whitespace).replace(" ", "").replace(whitespace, " ")
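

# Minimal usage sketch. NOTE: "tokenizer.json" is a placeholder path and must
# point to a tokenizers-format file for this character-level BPE model.
if __name__ == "__main__":
    processor = BasicTextProcessor("tokenizer.json")
    ids = processor("“Hello” — world…")
    print(ids)
    print(processor.detokenize(ids))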