# pardi-speech/tts/text_processor.py
import re
import unicodedata

from transformers import PreTrainedTokenizerFast

REPLACEMENTS = {
    # whitespace
    "\n": " ",
    "\r": " ",
    "\t": " ",
    "\xa0": " ",
    "\u2009": " ",
    "\u202f": " ",
    "\u200b": "",
    # quotes
    "‘": "'",
    "’": "'",
    "‚": "'",
    "‛": "'",
    "“": '"',
    "”": '"',
    "„": '"',
    "«": '"',
    "»": '"',
    # dashes
    "–": "-",
    "—": "-",
    "−": "-",
    # ellipsis
    "…": "...",
    # bullets & symbols
    "•": ".",
    "∙": ".",
    "·": ".",
    # currencies
    "€": " euros",
    "$": " dollars",
    "£": " pounds",
    "¥": " yen",
    # misc
    "°": " degrees",
    "©": "",
    "®": "",
    "™": "",
}


def clean_text(text: str) -> str:
    """Apply NFKC normalization and character replacements, then collapse whitespace."""
    text = unicodedata.normalize("NFKC", text)
    for src, tgt in REPLACEMENTS.items():
        text = text.replace(src, tgt)
    text = re.sub(r"\s+", " ", text.strip())  # collapse spaces
    return text
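
# Illustrative example (not part of the original module):
#   clean_text("Hello\u00a0“world”…\n")  ->  'Hello "world"...'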


class BasicTextProcessor:
    """
    Basic text processor on top of a character-level BPE model.
    """

    def __init__(self, tokenizer_file: str):
        self.tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_file)

    def normalize(self, text: str) -> str:
        """Basic pre-normalization: NFKC, character replacements, whitespace cleanup."""
        return clean_text(text)

    def __call__(self, text: str, **tokenizer_kwargs):
        """Normalize then tokenize."""
        text = self.normalize(text)
        return self.tokenizer.encode(text, **tokenizer_kwargs)

    def detokenize(self, token_ids):
        """Optional: convert back to string."""
        out = self.tokenizer.decode(token_ids, skip_special_tokens=False)
        # The character-level decode joins tokens with single spaces, so a
        # genuine space in the text surfaces as a double space. Protect real
        # spaces with a sentinel, drop the separator spaces, then restore them.
        whitespace = "##[WHITESPACE]"
        return out.replace("  ", whitespace).replace(" ", "").replace(whitespace, " ")
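

# Minimal usage sketch (illustrative; "tokenizer.json" is a placeholder path
# for a trained tokenizer file, not something shipped with this module):
if __name__ == "__main__":
    processor = BasicTextProcessor("tokenizer.json")
    ids = processor("Hello\u00a0“world”…")
    print(ids)
    print(processor.detokenize(ids))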