from transformers import PreTrainedTokenizerFast
import re
import unicodedata

REPLACEMENTS = {
    # whitespace
    "\n": " ",
    "\r": " ",
    "\t": " ",
    "\xa0": " ",
    "\u2009": " ",
    "\u202f": " ",
    "\u200b": "",
    # quotes
    "‘": "'",
    "’": "'",
    "‚": "'",
    "‛": "'",
    "“": '"',
    "”": '"',
    "„": '"',
    "«": '"',
    "»": '"',
    # dashes
    "–": "-",
    "—": "-",
    "−": "-",
    "-": "-",
    # ellipsis
    "…": "...",
    # bullets & symbols
    "•": ".",
    "∙": ".",
    "·": ".",
    # currencies
    "€": " euros",
    "$": " dollars",
    "£": " pounds",
    "¥": " yen",
    # misc
    "°": " degrees",
    "©": "",
    "®": "",
    "™": "",
}


def clean_text(text: str) -> str:
    """NFKC-normalize, apply the replacement table above, and collapse whitespace."""
    text = unicodedata.normalize("NFKC", text)
    for src, tgt in REPLACEMENTS.items():
        text = text.replace(src, tgt)
    text = re.sub(r"\s+", " ", text.strip())  # collapse runs of whitespace into single spaces
    return text


class BasicTextProcessor:
    """
    Basic text processor on top of a character level BPE model.
    """

    def __init__(self, tokenizer_file: str):
        self.tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_file)

    def normalize(self, text: str) -> str:
        """Pre-normalization: Unicode NFKC, quote/dash/symbol replacement, whitespace cleanup."""
        return clean_text(text)

    def __call__(self, text: str, **tokenizer_kwargs):
        """Normalize then tokenize."""
        text = self.normalize(text)
        return self.tokenizer.encode(text, **tokenizer_kwargs)

    def detokenize(self, token_ids):
        """Optional: convert token ids back to a string."""
        out = self.tokenizer.decode(token_ids, skip_special_tokens=False)
        # The character-level tokenizer joins tokens with single spaces, so a real
        # space in the original text decodes as a run of three spaces. Protect the
        # real spaces with a placeholder, strip the joiner spaces, then restore.
        whitespace = "##[WHITESPACE]"
        return out.replace("   ", whitespace).replace(" ", "").replace(whitespace, " ")
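

# Minimal usage sketch. Assumptions not taken from this file: a trained
# character-level BPE tokenizer saved as "tokenizer.json" (the path is only
# illustrative), and an input string with typographic punctuation.
if __name__ == "__main__":
    sample = "He said \u2014 \u201cthanks\u201d\u2026\n\tSee you\u00a0soon."
    print(clean_text(sample))  # -> He said - "thanks"... See you soon.

    processor = BasicTextProcessor("tokenizer.json")  # hypothetical tokenizer path
    ids = processor(sample)
    print(ids)
    print(processor.detokenize(ids))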