# -*- coding: utf-8 -*-
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pathlib
import random
import re
import time
from abc import ABC, abstractmethod
from collections import defaultdict
from typing import Callable, Dict, List, Optional, Set, Tuple, Union

import nltk
import torch

from nemo.collections.common.tokenizers.text_to_speech.ipa_lexicon import validate_locale
from nemo.collections.common.tokenizers.text_to_speech.tokenizer_utils import (
    LATIN_CHARS_ALL,
    any_locale_word_tokenize,
    english_word_tokenize,
    normalize_unicode_text,
)
from nemo.collections.tts.parts.utils.g2p_utils import GRAPHEME_CASE_MIXED, GRAPHEME_CASE_UPPER, set_grapheme_case
from nemo.utils import logging
from nemo.utils.decorators import experimental
from nemo.utils.get_rank import is_global_rank_zero


class BaseG2p(ABC):
    def __init__(
        self,
        phoneme_dict=None,
        word_tokenize_func=lambda x: x,
        apply_to_oov_word=None,
        mapping_file: Optional[str] = None,
    ):
        """Abstract class for modules that convert grapheme words into phoneme sequences, leave them
        unchanged, or handle them with `apply_to_oov_word`.
        Args:
            phoneme_dict: Arbitrary representation of a dictionary (grapheme -> phoneme) for known words.
            word_tokenize_func: Function for tokenizing text into words.
            apply_to_oov_word: Function that will be applied to out-of-vocabulary words missing from `phoneme_dict`.
        """
        self.phoneme_dict = phoneme_dict
        self.word_tokenize_func = word_tokenize_func
        self.apply_to_oov_word = apply_to_oov_word
        self.mapping_file = mapping_file
        self.heteronym_model = None  # heteronym classification model

    @abstractmethod
    def __call__(self, text: str) -> str:
        pass

    def setup_heteronym_model(
        self,
        heteronym_model,
        wordid_to_phonemes_file: str = "../../../scripts/tts_dataset_files/wordid_to_ipa-0.7b_nv22.10.tsv",
    ):
        """
        Add a heteronym classification model to the TTS preprocessing pipeline to disambiguate heteronyms.
        The heteronym model supports a list of heteronyms, but only the heteronyms specified in
        `wordid_to_phonemes_file` will be converted to phoneme form during heteronym model inference;
        the rest will be left in grapheme form.

        Args:
            heteronym_model: Initialized HeteronymClassificationModel
            wordid_to_phonemes_file: Path to a file with a mapping from the wordid predicted by the heteronym
                model to phonemes
        """

        try:
            from nemo.collections.tts.models.g2p_heteronym_classification import HeteronymClassificationModel

            self.heteronym_model = heteronym_model
            self.heteronym_model.set_wordid_to_phonemes(wordid_to_phonemes_file)
        except ImportError as e:
            logging.warning("Heteronym model setup will be skipped")
            logging.error(e)


class EnglishG2p(BaseG2p):
    def __init__(
        self,
        phoneme_dict=None,
        word_tokenize_func=english_word_tokenize,
        apply_to_oov_word=None,
        ignore_ambiguous_words=True,
        heteronyms=None,
        encoding='latin-1',
        phoneme_probability: Optional[float] = None,
        mapping_file: Optional[str] = None,
    ):
        """English G2P module. This module converts words from grapheme to phoneme representation
        using a `phoneme_dict` in CMU dict format.
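
        A minimal usage sketch (the dictionary path below is illustrative; any CMUdict-format file works):

            g2p = EnglishG2p(phoneme_dict="scripts/tts_dataset_files/cmudict-0.7b_nv22.10")
            phonemes = g2p("hello world")  # flat list of ARPABET symbols for words found in the dictionary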

        Optionally, it can ignore words which are heteronyms, ambiguous, or marked as unchangeable by
        `word_tokenize_func` (see the code for details). Ignored words are left unchanged or passed through
        `apply_to_oov_word` for handling.

        Args:
            phoneme_dict (str, Path, Dict): Path to file in CMUdict format or a dictionary of CMUdict-like entries.
            word_tokenize_func: Function for tokenizing text into words.
                It has to return List[Tuple[Union[str, List[str]], bool]] where every tuple denotes a word
                representation and a flag indicating whether to leave the word unchanged. A word that must stay
                unchanged is represented as List[str]; all other cases are represented as str. This is useful for
                marking a word that is already in phoneme representation as unchangeable.
            apply_to_oov_word: Function that will be applied to out-of-vocabulary words missing from `phoneme_dict`.
            ignore_ambiguous_words: Whether to skip words that have multiple (ambiguous) entries in `phoneme_dict`
                instead of phonemizing them. Defaults to True.
            heteronyms (str, Path, List): Path to a file with heteronyms (one word per line) or a list of words.
            encoding: Encoding type.
            phoneme_probability (Optional[float]): The probability (0.0 < p <= 1.0) with which a dictionary word is
                transliterated into phonemes; otherwise it is kept as graphemes. Defaults to None, which is
                equivalent to 1.0 (always phonemize). Note that this only applies to words that can be phonemized
                at all: words missing from the dictionary, or ambiguous words when `ignore_ambiguous_words` is True,
                are kept as graphemes regardless.
        """
        phoneme_dict = (
            self._parse_as_cmu_dict(phoneme_dict, encoding)
            if isinstance(phoneme_dict, str) or isinstance(phoneme_dict, pathlib.Path) or phoneme_dict is None
            else phoneme_dict
        )

        if apply_to_oov_word is None:
            logging.warning(
                "apply_to_oov_word=None, This means that some of words will remain unchanged "
                "if they are not handled by any of the rules in self.parse_one_word(). "
                "This may be intended if phonemes and chars are both valid inputs, otherwise, "
                "you may see unexpected deletions in your input."
            )

        super().__init__(
            phoneme_dict=phoneme_dict,
            word_tokenize_func=word_tokenize_func,
            apply_to_oov_word=apply_to_oov_word,
            mapping_file=mapping_file,
        )

        self.ignore_ambiguous_words = ignore_ambiguous_words
        self.heteronyms = (
            set(self._parse_file_by_lines(heteronyms))
            if isinstance(heteronyms, str) or isinstance(heteronyms, pathlib.Path)
            else heteronyms
        )
        self.phoneme_probability = phoneme_probability
        self._rng = random.Random()

    @staticmethod
    def _parse_as_cmu_dict(phoneme_dict_path=None, encoding='latin-1'):
        if phoneme_dict_path is None:
            # Fall back to the CMUdict shipped with NLTK. Download it on global rank 0 only and make the other
            # ranks wait, to avoid race conditions on the shared corpus directory.
            if torch.distributed.is_available() and torch.distributed.is_initialized():
                group = torch.distributed.group.WORLD
                if is_global_rank_zero():
                    try:
                        nltk.data.find('corpora/cmudict.zip')
                    except LookupError:
                        nltk.download('cmudict', quiet=True)
                torch.distributed.barrier(group=group)
            elif is_global_rank_zero():
                try:
                    nltk.data.find('corpora/cmudict.zip')
                except LookupError:
                    nltk.download('cmudict', quiet=True)
            else:
                logging.error(
                    "This process is not rank 0 and torch.distributed is not initialized; sleeping for 1 min "
                    "while rank 0 downloads the NLTK cmudict corpus. If this rank wakes up before the download "
                    "finishes, errors may result."
                )
                time.sleep(60)

            logging.warning(
                "phoneme_dict_path=None, so the English g2p dictionary will be loaded from nltk.corpus.cmudict."
            )
            return nltk.corpus.cmudict.dict()

        # CMUdict format: a word (optionally with a "(N)" variant suffix) followed by two spaces and its phonemes.
        _alt_re = re.compile(r'\([0-9]+\)')
        g2p_dict = {}
        with open(phoneme_dict_path, encoding=encoding) as file:
            for line in file:
                if len(line) and ('A' <= line[0] <= 'Z' or line[0] == "'"):
                    parts = line.split('  ')
                    word = re.sub(_alt_re, '', parts[0])
                    word = word.lower()

                    pronunciation = parts[1].strip().split(" ")
                    if word in g2p_dict:
                        g2p_dict[word].append(pronunciation)
                    else:
                        g2p_dict[word] = [pronunciation]
        return g2p_dict

    @staticmethod
    def _parse_file_by_lines(p):
        with open(p, 'r') as f:
            return [line.rstrip() for line in f.readlines()]

    def is_unique_in_phoneme_dict(self, word):
        return len(self.phoneme_dict[word]) == 1

    def parse_one_word(self, word: str):
        """Returns parsed `word` and `status` as a bool.
        `status` will be False if the word wasn't handled, True otherwise.
        """
        # Keep the word as graphemes with probability (1 - phoneme_probability).
        if self.phoneme_probability is not None and self._rng.random() > self.phoneme_probability:
            return word, True

        # punctuation or whitespace.
        if re.search(r"[a-zA-ZÀ-ÿ\d]", word) is None:
            return list(word), True

        # heteronyms
        if self.heteronyms is not None and word in self.heteronyms:
            return word, True

        # `'s` suffix
        if (
            len(word) > 2
            and word.endswith("'s")
            and (word not in self.phoneme_dict)
            and (word[:-2] in self.phoneme_dict)
            and (not self.ignore_ambiguous_words or self.is_unique_in_phoneme_dict(word[:-2]))
        ):
            return self.phoneme_dict[word[:-2]][0] + ["Z"], True

        # `s` suffix
        if (
            len(word) > 1
            and word.endswith("s")
            and (word not in self.phoneme_dict)
            and (word[:-1] in self.phoneme_dict)
            and (not self.ignore_ambiguous_words or self.is_unique_in_phoneme_dict(word[:-1]))
        ):
            return self.phoneme_dict[word[:-1]][0] + ["Z"], True

        # phoneme dict
        if word in self.phoneme_dict and (not self.ignore_ambiguous_words or self.is_unique_in_phoneme_dict(word)):
            return self.phoneme_dict[word][0], True

        if self.apply_to_oov_word is not None:
            return self.apply_to_oov_word(word), True
        else:
            return word, False

    def __call__(self, text):
        words = self.word_tokenize_func(text)

        prons = []
        for word, without_changes in words:
            if without_changes:
                prons.extend(word)
                continue

            word_str = word[0]
            word_by_hyphen = word_str.split("-")

            pron, is_handled = self.parse_one_word(word_str)

            if not is_handled and len(word_by_hyphen) > 1:
                pron = []
                for sub_word in word_by_hyphen:
                    p, _ = self.parse_one_word(sub_word)
                    pron.extend(p)
                    pron.extend(["-"])
                pron.pop()

            prons.extend(pron)

        return prons


@experimental
class IPAG2P(BaseG2p):
    # fmt: off
    STRESS_SYMBOLS = ["ˈ", "ˌ"]
    # Regex for roman characters, accented characters, and locale-agnostic numbers/digits
    CHAR_REGEX = re.compile(fr"[{LATIN_CHARS_ALL}\d]")
    PUNCT_REGEX = re.compile(fr"[^{LATIN_CHARS_ALL}\d]")
    # fmt: on

    def __init__(
        self,
        phoneme_dict: Union[str, pathlib.Path, dict],
        locale: str = "en-US",
        apply_to_oov_word: Optional[Callable[[str], str]] = None,
        ignore_ambiguous_words: bool = True,
        heteronyms: Optional[Union[str, pathlib.Path, List[str]]] = None,
        use_chars: bool = False,
        phoneme_probability: Optional[float] = None,
        use_stresses: Optional[bool] = True,
        grapheme_case: Optional[str] = GRAPHEME_CASE_UPPER,
        grapheme_prefix: Optional[str] = "",
        mapping_file: Optional[str] = None,
    ) -> None:
        """
        Generic IPA G2P module. This module converts words from graphemes to International Phonetic Alphabet
        representations.
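
        A minimal usage sketch (the dictionary path is illustrative; any IPA dictionary in CMUdict format works):

            g2p = IPAG2P(phoneme_dict="scripts/tts_dataset_files/ipa_cmudict-0.7b_nv22.06.txt", locale="en-US")
            symbols = g2p("Hello world.")  # list of IPA phonemes and/or prefixed graphemes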

        Optionally, it can ignore heteronyms, ambiguous words, or words marked as unchangeable by
        `word_tokenize_func` (see the code for details). Ignored words are left unchanged or passed through
        `apply_to_oov_word` for handling.

        Args:
            phoneme_dict (str, Path, or Dict): Path to file in CMUdict format or an IPA dict object with
                CMUdict-like entries. For example,
                a dictionary file: scripts/tts_dataset_files/ipa_cmudict-0.7b_nv22.06.txt;
                a dictionary object: {..., "Wire": [["ˈ", "w", "a", "ɪ", "ɚ"], ["ˈ", "w", "a", "ɪ", "ɹ"]], ...}.
            locale (str): Locale used to determine locale-specific tokenization logic. Currently, it supports
                "en-US", "de-DE", and "es-ES". Defaults to "en-US". Specify None if implementing custom logic for
                a new locale.
            apply_to_oov_word (Callable): Function that deals with out-of-vocabulary (OOV) words that do not exist
                in the `phoneme_dict`.
            ignore_ambiguous_words (bool): Whether to skip words that have multiple (ambiguous) pronunciations in
                `phoneme_dict` instead of phonemizing them. Defaults to True.
            heteronyms (str, Path, List[str]): Path to a file that includes heteronyms (one word entry per line),
                or a list of words.
            use_chars (bool): Whether to include chars/graphemes in the token list. Set this to True if
                `phoneme_probability` is not None or if the `apply_to_oov_word` function ever returns graphemes.
            phoneme_probability (Optional[float]): The probability (0.0 <= ε <= 1.0) used to decide whether a word
                in a sentence is transliterated into a sequence of phonemes or kept as a sequence of graphemes. If
                a random number drawn for a word is greater than ε, the word is kept as graphemes; otherwise, it is
                transliterated into phonemes. Defaults to None, which is equivalent to 1.0, i.e. always
                transliterating the word into phonemes. Note that this code path is only run if the word can be
                transliterated at all: if a word does not have an entry in the g2p dict, it is kept as graphemes,
                and if a word has multiple pronunciations in the g2p dict and `ignore_ambiguous_words` is True, it
                is kept as graphemes as well.
            use_stresses (Optional[bool]): Whether to include the stress symbols (ˈ and ˌ).
            grapheme_case (Optional[str]): Convert all graphemes to uppercase or lowercase, or keep their original
                mixed case. You may want to use this feature to distinguish the grapheme set from the phoneme set
                if there is an overlap between them. Defaults to `upper` because the phoneme set uses only lowercase
                symbols. You could explicitly prepend `grapheme_prefix` to distinguish them.
            grapheme_prefix (Optional[str]): Prepend a special symbol to all graphemes in order to distinguish
                graphemes from phonemes, because there may be overlaps between the two sets. Choose a prefix that
                is not used or reserved elsewhere. "#" could be a good candidate. Defaults to "".
            TODO @borisfom: add docstring for newly added `mapping_file` argument.
        """
        self.use_stresses = use_stresses
        self.grapheme_case = grapheme_case
        self.grapheme_prefix = grapheme_prefix
        self.phoneme_probability = phoneme_probability
        self.locale = locale
        self._rng = random.Random()

        if locale is not None:
            validate_locale(locale)

        if not use_chars and self.phoneme_probability is not None:
            self.use_chars = True
            logging.warning(
                "phoneme_probability was not None, characters will be enabled even though "
                "use_chars was set to False."
            )
        else:
            self.use_chars = use_chars

        phoneme_dict_obj = self._parse_phoneme_dict(phoneme_dict)

        # verify if phoneme dict obj is empty
        if phoneme_dict_obj:
            self.phoneme_dict, self.symbols = self._normalize_dict(phoneme_dict_obj)
        else:
            raise ValueError(f"{phoneme_dict} contains no entries!")

        if apply_to_oov_word is None:
            logging.warning(
                "apply_to_oov_word=None, This means that some of words will remain unchanged "
                "if they are not handled by any of the rules in self.parse_one_word(). "
                "This may be intended if phonemes and chars are both valid inputs, otherwise, "
                "you may see unexpected deletions in your input."
            )

        # word_tokenize_func returns a List[Tuple[List[str], bool]] where every tuple denotes
        # a word representation (a list of tokens) and a flag indicating whether to process the word or
        # leave it unchanged.
        if locale == "en-US":
            word_tokenize_func = english_word_tokenize
        else:
            word_tokenize_func = any_locale_word_tokenize

        super().__init__(
            phoneme_dict=self.phoneme_dict,
            word_tokenize_func=word_tokenize_func,
            apply_to_oov_word=apply_to_oov_word,
            mapping_file=mapping_file,
        )

        self.ignore_ambiguous_words = ignore_ambiguous_words
        if isinstance(heteronyms, str) or isinstance(heteronyms, pathlib.Path):
            self.heteronyms = set(self._parse_file_by_lines(heteronyms))
        elif isinstance(heteronyms, list) and all(isinstance(het, str) for het in heteronyms):
            self.heteronyms = set(heteronyms)
        else:
            self.heteronyms = None

        if self.heteronyms:
            self.heteronyms = {set_grapheme_case(het, case=self.grapheme_case) for het in self.heteronyms}

    @staticmethod
    def _parse_phoneme_dict(
        phoneme_dict: Union[str, pathlib.Path, Dict[str, List[List[str]]]]
    ) -> Dict[str, List[List[str]]]:
        """
        Parse an input IPA dictionary and save it as a dict object.

        Args:
            phoneme_dict (Union[str, pathlib.Path, dict]): Path to file in CMUdict format or an IPA dict object
                with CMUdict-like entries. For example,
                a dictionary file: scripts/tts_dataset_files/ipa_cmudict-0.7b_nv22.06.txt;
                a dictionary object: {..., "Wire": [["ˈ", "w", "a", "ɪ", "ɚ"], ["ˈ", "w", "a", "ɪ", "ɹ"]], ...}.

        Returns: a dict object (Dict[str, List[List[str]]]).
        """
        if isinstance(phoneme_dict, str) or isinstance(phoneme_dict, pathlib.Path):
            # load the dictionary file where there may exist a digit suffix after a word, e.g. "Word(2)", which
            # represents a pronunciation variant of that word.
            phoneme_dict_obj = defaultdict(list)
            _alt_re = re.compile(r"\([0-9]+\)")
            with open(phoneme_dict, "r", encoding="utf-8") as fdict:
                for line in fdict:
                    # skip the empty lines
                    if len(line) == 0:
                        continue

                    # Note that the latin character pattern should be consistent with
                    # nemo.collections.common.tokenizers.text_to_speech.tokenizer_utils.LATIN_CHARS_ALL. It is
                    # advised to extend its character coverage when adding support for new languages.
                    # TODO @xueyang: unify hardcoded range of characters with LATIN_CHARS_ALL to avoid duplicates.
                    line = normalize_unicode_text(line)

                    if (
                        'A' <= line[0] <= 'Z'
                        or 'a' <= line[0] <= 'z'
                        or 'À' <= line[0] <= 'Ö'
                        or 'Ø' <= line[0] <= 'ö'
                        or 'ø' <= line[0] <= 'ÿ'
                        or line[0] == "'"
                    ):
                        parts = line.strip().split(maxsplit=1)
                        word = re.sub(_alt_re, "", parts[0])
                        prons = re.sub(r"\s+", "", parts[1])
                        phoneme_dict_obj[word].append(list(prons))
        else:
            # Load phoneme_dict as dictionary object
            logging.info("Loading phoneme_dict as a Dict object, and validating its entry format.")

            phoneme_dict_obj = {}
            for word, prons in phoneme_dict.items():
                # validate dict entry format
                assert isinstance(
                    prons, list
                ), f"Pronunciation type <{type(prons)}> is not supported. Please convert to List[List[str]]."
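                # For example, a valid entry maps a word to one or more pronunciations, each given as a list of
                # IPA symbols: "Wire" -> [["ˈ", "w", "a", "ɪ", "ɚ"], ["ˈ", "w", "a", "ɪ", "ɹ"]].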
                # normalize word with NFC form
                word = normalize_unicode_text(word)

                # normalize phonemes with NFC form
                prons = [[normalize_unicode_text(p) for p in pron] for pron in prons]

                phoneme_dict_obj.update({word: prons})

        return phoneme_dict_obj

    def replace_dict(self, phoneme_dict: Union[str, pathlib.Path, Dict[str, List[List[str]]]]):
        """
        Replace the model's phoneme dictionary with a custom one
        """
        self.phoneme_dict = self._parse_phoneme_dict(phoneme_dict)

    @staticmethod
    def _parse_file_by_lines(p: Union[str, pathlib.Path]) -> List[str]:
        with open(p, 'r') as f:
            return [line.rstrip() for line in f.readlines()]

    def _prepend_prefix_for_one_word(self, word: str) -> List[str]:
        return [f"{self.grapheme_prefix}{character}" for character in word]

    def _normalize_dict(self, phoneme_dict_obj: Dict[str, List[List[str]]]) -> Tuple[Dict[str, List[List[str]]], Set]:
        """
        Parse a python dict object according to the decision on word cases and removal of lexical stress markers.

        Args:
            phoneme_dict_obj (Dict[str, List[List[str]]]): a dictionary object.
                e.g. {..., "Wire": [["ˈ", "w", "a", "ɪ", "ɚ"], ["ˈ", "w", "a", "ɪ", "ɹ"]], ...}

        Returns:
            g2p_dict (dict): processed dict.
            symbols (set): an IPA phoneme set, or its union with the grapheme set.
        """
        g2p_dict = defaultdict(list)
        symbols = set()
        for word, prons in phoneme_dict_obj.items():
            # process word
            # update word cases.
            word_new = set_grapheme_case(word, case=self.grapheme_case)

            # add grapheme symbols if `use_chars=True`.
            if self.use_chars:
                # remove punctuation within a word. Punctuation can exist at the start, middle, and end of a word.
                word_no_punct = self.PUNCT_REGEX.sub('', word_new)

                # add prefix to distinguish graphemes from phonemes.
                symbols.update(self._prepend_prefix_for_one_word(word_no_punct))

            # process IPA pronunciations
            # update phoneme symbols by removing lexical stress markers if `use_stresses=False`.
            prons_new = list()
            if not self.use_stresses:
                for pron in prons:
                    prons_new.append([symbol for symbol in pron if symbol not in self.STRESS_SYMBOLS])
            else:
                prons_new = prons

            # update symbols
            for pron in prons_new:
                symbols.update(pron)  # This will insert each char individually

            # update dict entry
            g2p_dict[word_new] = prons_new

            # duplicate word entries if grapheme_case is mixed. Even though grapheme_case is set to mixed, the
            # words in the original input text and the g2p_dict remain unchanged, so they could still be either
            # lowercase, uppercase, or mixed-case as defined in the `set_grapheme_case` func. When mapping an
            # uppercase word, e.g. "HELLO", into phonemes using a g2p_dict with {"Hello": [["həˈɫoʊ"]]}, "HELLO"
            # can't find its pronunciations in the g2p_dict due to the case mismatch. Augmenting the g2p_dict with
            # an uppercase word entry, e.g. {"Hello": [["həˈɫoʊ"]], "HELLO": [["həˈɫoʊ"]]}, makes it possible to
            # find "HELLO"'s pronunciations rather than directly considering it as an OOV.
            if self.grapheme_case == GRAPHEME_CASE_MIXED and not word_new.isupper():
                g2p_dict[word_new.upper()] = prons_new

        return g2p_dict, symbols

    def is_unique_in_phoneme_dict(self, word: str) -> bool:
        return len(self.phoneme_dict[word]) == 1

    def parse_one_word(self, word: str) -> Tuple[List[str], bool]:
        """Returns parsed `word` and `status` (bool: False if word wasn't handled, True otherwise)."""
        word = set_grapheme_case(word, case=self.grapheme_case)

        # Punctuation (assumes other chars have been stripped)
        if self.CHAR_REGEX.search(word) is None:
            return list(word), True

        # Keep graphemes of a word with a probability.
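        # With probability (1 - phoneme_probability), return the prefixed graphemes instead of phonemes, so that a
        # model sees a mix of grapheme and phoneme inputs (see `phoneme_probability` in the constructor docstring).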
        if self.phoneme_probability is not None and self._rng.random() > self.phoneme_probability:
            return self._prepend_prefix_for_one_word(word), True

        # Heteronyms
        if self.heteronyms and word in self.heteronyms:
            return self._prepend_prefix_for_one_word(word), True

        # special cases for en-US when transliterating a word into a list of phonemes.
        # TODO @xueyang: add special cases for any other languages upon new findings.
        if self.locale == "en-US":
            # `'s` suffix (with apostrophe) - not in phoneme dict
            if len(word) > 2 and (word.endswith("'s") or word.endswith("'S")):
                word_found = None
                if (word not in self.phoneme_dict) and (word.upper() not in self.phoneme_dict):
                    if word[:-2] in self.phoneme_dict:
                        word_found = word[:-2]
                    elif word[:-2].upper() in self.phoneme_dict:
                        word_found = word[:-2].upper()

                if word_found is not None and (
                    not self.ignore_ambiguous_words or self.is_unique_in_phoneme_dict(word_found)
                ):
                    if word_found[-1] in ['T', 't']:
                        # for example, "airport's" doesn't exist in the dict while "airport" exists. So append a
                        # phoneme /s/ at the end of "airport"'s first pronunciation.
                        return self.phoneme_dict[word_found][0] + ["s"], True
                    elif word_found[-1] in ['S', 's']:
                        # for example, "jones's" doesn't exist in the dict while "jones" exists. So append two
                        # phonemes, /ɪ/ and /z/, at the end of "jones"'s first pronunciation.
                        return self.phoneme_dict[word_found][0] + ["ɪ", "z"], True
                    else:
                        return self.phoneme_dict[word_found][0] + ["z"], True

            # `s` suffix (without apostrophe) - not in phoneme dict
            if len(word) > 1 and (word.endswith("s") or word.endswith("S")):
                word_found = None
                if (word not in self.phoneme_dict) and (word.upper() not in self.phoneme_dict):
                    if word[:-1] in self.phoneme_dict:
                        word_found = word[:-1]
                    elif word[:-1].upper() in self.phoneme_dict:
                        word_found = word[:-1].upper()

                if word_found is not None and (
                    not self.ignore_ambiguous_words or self.is_unique_in_phoneme_dict(word_found)
                ):
                    if word_found[-1] in ['T', 't']:
                        # for example, "airports" doesn't exist in the dict while "airport" exists. So append a
                        # phoneme /s/ at the end of "airport"'s first pronunciation.
                        return self.phoneme_dict[word_found][0] + ["s"], True
                    else:
                        return self.phoneme_dict[word_found][0] + ["z"], True

        # For words that have a single pronunciation, directly look them up in the phoneme_dict; for words that
        # have multiple pronunciation variants, if we don't want to ignore them, directly choose their first
        # pronunciation variant as the target phonemes.
        # TODO @xueyang: this is a temporary solution; always choosing the first pronunciation variant is not
        # optimal when a word has multiple variants. We need to explore a better approach to select the optimal
        # pronunciation variant aligning with its reference audio.
        if word in self.phoneme_dict and (not self.ignore_ambiguous_words or self.is_unique_in_phoneme_dict(word)):
            return self.phoneme_dict[word][0], True

        if (
            self.grapheme_case == GRAPHEME_CASE_MIXED
            and word not in self.phoneme_dict
            and word.upper() in self.phoneme_dict
        ):
            word = word.upper()
            if not self.ignore_ambiguous_words or self.is_unique_in_phoneme_dict(word):
                return self.phoneme_dict[word][0], True

        if self.apply_to_oov_word is not None:
            return self.apply_to_oov_word(word), True
        else:
            return self._prepend_prefix_for_one_word(word), False

    def __call__(self, text: str) -> List[str]:
        text = normalize_unicode_text(text)

        if self.heteronym_model is not None:
            try:
                text = self.heteronym_model.disambiguate(sentences=[text])[1][0]
            except Exception as e:
                logging.warning(f"Heteronym model failed {e}, skipping")

        words_list_of_tuple = self.word_tokenize_func(text)

        prons = []
        for words, without_changes in words_list_of_tuple:
            if without_changes:
                # for example: (["NVIDIA", "unchanged"], True). "NVIDIA" is considered as a single token.
                prons.extend([f"{self.grapheme_prefix}{word}" for word in words])
            else:
                assert (
                    len(words) == 1
                ), f"{words} should only have a single item when `without_changes` is False, but found {len(words)}."

                word = words[0]
                pron, is_handled = self.parse_one_word(word)

                # If `is_handled` is False, then the only possible case is that the word is an OOV. The OOV may
                # contain a hyphen so that it doesn't show up in the g2p dictionary. We need to split it into
                # sub-words by the hyphen and parse the sub-words again in case any sub-word exists in the g2p
                # dictionary.
                if not is_handled:
                    subwords_by_hyphen = word.split("-")
                    if len(subwords_by_hyphen) > 1:
                        pron = []  # reset the previous pron
                        for sub_word in subwords_by_hyphen:
                            p, _ = self.parse_one_word(sub_word)
                            pron.extend(p)
                            pron.append("-")
                        pron.pop()  # remove the redundant hyphen that was appended at the end of the word.

                prons.extend(pron)

        return prons


class ChineseG2p(BaseG2p):
    def __init__(
        self,
        phoneme_dict=None,
        word_tokenize_func=None,
        apply_to_oov_word=None,
        mapping_file: Optional[str] = None,
        word_segmenter: Optional[str] = None,
    ):
        """Chinese G2P module. This module first converts Chinese characters into pinyin sequences using pypinyin;
        the pinyin sequences are then further converted into phoneme sequences using the pinyin_dict_nv_22.10.txt
        dict file. For Chinese-English bilingual sentences, the English words are converted into letters.

        Args:
            phoneme_dict (str, Path, Dict): Path to the pinyin_dict_nv_22.10.txt dict file.
            word_tokenize_func: Function for tokenizing text into words.
                It has to return List[Tuple[Union[str, List[str]], bool]] where every tuple denotes a word
                representation and a flag indicating whether to leave the word unchanged. A word that must stay
                unchanged is represented as List[str]; all other cases are represented as str. This is useful for
                marking a word that is already in phoneme representation as unchangeable.
            apply_to_oov_word: Function that will be applied to out-of-vocabulary words missing from `phoneme_dict`.
            word_segmenter: Method used to segment utterances into words for better polyphone disambiguation.
        """
        assert phoneme_dict is not None, "Please set the phoneme_dict path."
        assert word_segmenter in [
            None,
            'jieba',
        ], f"{word_segmenter} is not supported now. Please choose a supported word_segmenter."
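
        # NOTE: `pypinyin` and `pypinyin_dict` (and `jieba`, if selected as the word segmenter) are imported lazily
        # below, so they must be installed for this module to work.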
        phoneme_dict = (
            self._parse_as_pinyin_dict(phoneme_dict)
            if isinstance(phoneme_dict, str) or isinstance(phoneme_dict, pathlib.Path)
            else phoneme_dict
        )

        if apply_to_oov_word is None:
            logging.warning(
                "apply_to_oov_word=None, This means that some of words will remain unchanged "
                "if they are not handled by any of the rules in self.parse_one_word(). "
                "This may be intended if phonemes and chars are both valid inputs, otherwise, "
                "you may see unexpected deletions in your input."
            )

        super().__init__(
            phoneme_dict=phoneme_dict,
            word_tokenize_func=word_tokenize_func,
            apply_to_oov_word=apply_to_oov_word,
            mapping_file=mapping_file,
        )
        self.tones = {'1': '#1', '2': '#2', '3': '#3', '4': '#4', '5': '#5'}

        if word_segmenter == "jieba":
            try:
                import jieba
            except ImportError as e:
                logging.error(e)

            # Cut sentences into words to improve polyphone disambiguation
            self.word_segmenter = jieba.cut
        else:
            self.word_segmenter = lambda x: [x]

        try:
            from pypinyin import Style, lazy_pinyin
            from pypinyin_dict.pinyin_data import cc_cedict
        except ImportError as e:
            logging.error(e)

        # replace pypinyin default dict with cc_cedict.txt for polyphone disambiguation
        cc_cedict.load()
        self._lazy_pinyin = lazy_pinyin
        self._Style = Style

    @staticmethod
    def _parse_as_pinyin_dict(phoneme_dict_path):
        """Loads the pinyin dict file and generates a set of all valid symbols."""
        g2p_dict = defaultdict(list)
        with open(phoneme_dict_path, 'r') as file:
            for line in file:
                parts = line.split('\t')
                # lowercase the key, since pypinyin returns lowercase representations
                pinyin = parts[0].lower()
                pronunciation = parts[1].split()
                pronunciation_with_sharp = ['#' + pron for pron in pronunciation]
                g2p_dict[pinyin] = pronunciation_with_sharp
        return g2p_dict

    def __call__(self, text):
        """
        The `errors` handler passed to `lazy_pinyin` below handles the bilingual case: English words are split
        into individual letters.
        e.g. 我今天去了Apple Store, 买了一个iPhone。
        would return a list
        ['wo3', 'jin1', 'tian1', 'qu4', 'le5', 'A', 'p', 'p', 'l', 'e', ' ', 'S', 't', 'o', 'r', 'e', ',', ' ',
         'mai3', 'le5', 'yi2', 'ge4', 'i', 'P', 'h', 'o', 'n', 'e', '。']
        """
        pinyin_seq = []
        words_list = self.word_segmenter(text)

        for word in words_list:
            pinyin_seq += self._lazy_pinyin(
                word,
                style=self._Style.TONE3,
                neutral_tone_with_five=True,
                errors=lambda en_words: [letter for letter in en_words],
            )
        phoneme_seq = []
        for pinyin in pinyin_seq:
            # All pinyin syllables end with a digit in 1-5 that represents the tone of the syllable.
            # Symbols that are not pinyin, e.g. English letters or Chinese punctuation, are used as inputs directly.
            if pinyin[-1] in self.tones:
                assert pinyin[:-1] in self.phoneme_dict, pinyin[:-1]
                phoneme_seq += self.phoneme_dict[pinyin[:-1]]
                phoneme_seq.append(self.tones[pinyin[-1]])
            else:
                phoneme_seq.append(pinyin)
        return phoneme_seq
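

# A minimal usage sketch (not part of the library API): the dictionary path below is illustrative and must point
# to a real file from the NeMo repository or to your own IPA dictionary.
if __name__ == "__main__":
    example_g2p = IPAG2P(
        phoneme_dict="scripts/tts_dataset_files/ipa_cmudict-0.7b_nv22.06.txt",  # illustrative path
        locale="en-US",
        grapheme_prefix="#",
    )
    print(example_g2p("Hello NVIDIA."))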