import re
import unicodedata

from transformers import PreTrainedTokenizerFast

REPLACEMENTS = {
    # whitespace
    "\n": " ",
    "\r": " ",
    "\t": " ",
    "\xa0": " ",
    "\u2009": " ",
    "\u202f": " ",
    "\u200b": "",
    # quotes
    "‘": "'",
    "’": "'",
    "‚": "'",
    "‛": "'",
    "“": '"',
    "”": '"',
    "„": '"',
    "«": '"',
    "»": '"',
    # dashes
    "–": "-",
    "—": "-",
    "−": "-",
    "\u2010": "-",  # NOTE: assumption; the original entry mapped "-" to itself (a no-op), U+2010 HYPHEN is the likely intent
    # ellipsis
    "…": "...",
    # bullets & symbols
    "•": ".",
    "∙": ".",
    "·": ".",
    # currencies
    "€": " euros",
    "$": " dollars",
    "£": " pounds",
    "¥": " yen",
    # misc
    "°": " degrees",
    "©": "",
    "®": "",
    "™": "",
}


def clean_text(text: str) -> str:
    """Apply NFKC normalization and the replacement table, then collapse whitespace."""
    text = unicodedata.normalize("NFKC", text)
    for src, tgt in REPLACEMENTS.items():
        text = text.replace(src, tgt)
    text = re.sub(r"\s+", " ", text.strip())  # collapse whitespace runs
    return text
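

# Illustrative example (not executed): curly quotes, dashes, and ellipses fold to ASCII:
#   clean_text("“Hello” — world…")  ->  '"Hello" - world...'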


class BasicTextProcessor:
    """
    Basic text processor on top of a character-level BPE model.
    """

    def __init__(self, tokenizer_file: str):
        self.tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_file)

    def normalize(self, text: str) -> str:
        """Basic pre-normalization: Unicode folding, punctuation replacement, whitespace cleanup."""
        # clean_text() already strips leading/trailing whitespace.
        return clean_text(text)

    def __call__(self, text: str, **tokenizer_kwargs):
        """Normalize, then tokenize."""
        text = self.normalize(text)
        return self.tokenizer.encode(text, **tokenizer_kwargs)

    def detokenize(self, token_ids):
        """Optional: convert back to a string."""
        out = self.tokenizer.decode(token_ids, skip_special_tokens=False)
        # decode() joins character-level tokens with single spaces, so a real
        # space in the text surfaces as a run of three spaces (separator,
        # space token, separator). NOTE: the three-space width is an assumption
        # about this tokenizer's decoder; the original source rendered the
        # first pattern as a single space, which makes the chain a no-op.
        whitespace = "##[WHITESPACE]"
        return out.replace("   ", whitespace).replace(" ", "").replace(whitespace, " ")
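

# Minimal usage sketch. NOTE: "tokenizer.json" is a placeholder path and must
# point to a tokenizers-format file for this character-level BPE model.
if __name__ == "__main__":
    processor = BasicTextProcessor("tokenizer.json")
    ids = processor("“Hello” — world…")
    print(ids)
    print(processor.detokenize(ids))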