Spaces:
Build error
Build error
| # ---------------------------------------------------------------------------- | |
| # SpeechLM: Enhanced Speech Pre-Training with Unpaired Textual Data (https://arxiv.org/abs/2209.15329) | |
| # Github source: https://github.com/microsoft/SpeechT5/tree/main/SpeechLM | |
| # Code based on fairseq: https://github.com/facebookresearch/fairseq/tree/272c4c5197250997148fb12c0db6306035f166a4 | |
| # | |
| # Copyright (c) 2022 Microsoft | |
| # Licensed under The MIT License [see LICENSE for details] | |
| # ---------------------------------------------------------------------------- | |
| """ | |
| Modified from https://github.com/facebookresearch/fairseq/tree/272c4c5197250997148fb12c0db6306035f166a4/examples/wav2vec/unsupervised/scripts/phonemize_with_sil.py | |
| """ | |
| import argparse | |
| import numpy as np | |
| import sys | |
| from g2p_en import G2p | |
| from tqdm import tqdm | |
| import logging | |
# Configure the root logger once at import time: INFO and above, with a
# timestamp, level and logger name in front of every message.
logging.basicConfig(
    level=logging.INFO,
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
)
# Logger named after this module.
logger = logging.getLogger(__name__)
def get_parser():
    """Build the command-line parser for the word-to-phone conversion script.

    Returns:
        argparse.ArgumentParser: parser exposing ``--sil-prob``/``-s``,
        ``--surround``, ``--lexicon`` (required), ``--strict``,
        ``--input``/``-i`` (required) and ``--output``/``-o`` (required).
    """
    parser = argparse.ArgumentParser(
        description="converts words to phones adding optional silences around in between words"
    )
    parser.add_argument(
        "--sil-prob",
        "-s",
        type=float,
        # Float literal so the default has the same type as a parsed value.
        default=0.0,
        help="probability of inserting silence between each word",
    )
    parser.add_argument(
        "--surround",
        action="store_true",
        help="if set, surrounds each example with silence",
    )
    parser.add_argument(
        "--lexicon",
        help="lexicon to convert to phones",
        required=True,
    )
    parser.add_argument(
        "--strict",
        action="store_true",
        # Describes what main() does with this flag: nothing is raised on OOV.
        help=(
            "if set, lines containing OOV words are kept and the OOV words are "
            "phonemized with G2P; otherwise such lines are skipped "
            "(for train/valid set)"
        ),
    )
    parser.add_argument(
        "--input",
        "-i",
        help="input text file",
        required=True,
    )
    parser.add_argument(
        "--output",
        "-o",
        help="output text file",
        required=True,
    )
    return parser
def normalize_phn(phons):
    """
    Map g2p-style phones onto the 39-phone set.

    Every trailing digit is removed from each phone (e.g. "AH0" -> "AH");
    a new list is returned and the input is left untouched.
    """
    trailing_digits = '0123456789'
    stripped = []
    for phone in phons:
        stripped.append(phone.rstrip(trailing_digits))
    return stripped
def _load_lexicon(path):
    """Read a pronunciation lexicon into a ``{word: [phone, ...]}`` dict.

    Each line must hold a word followed by at least one phone, separated by
    whitespace, and every word may appear only once.

    Raises:
        ValueError: if a line has fewer than two fields or a word is repeated.
    """
    wrd_to_phn = {}
    # Decode explicitly as UTF-8 (the output is written as UTF-8 too) instead
    # of depending on the locale's default encoding.
    with open(path, "r", encoding="utf-8") as lf:
        for line in lf:
            items = line.rstrip().split()
            # Explicit raises rather than asserts: asserts vanish under -O.
            if len(items) < 2:
                raise ValueError(f"malformed lexicon line: {line!r}")
            word, *phones = items
            if word in wrd_to_phn:
                raise ValueError(f"duplicate lexicon entry: {items}")
            wrd_to_phn[word] = phones
    return wrd_to_phn


def _words_to_phones(words, wrd_to_phn, g2p, sil, sil_prob, surround):
    """Convert one line's words into a flat phone list with optional silences."""
    phones = [sil] if surround else []

    # One uniform draw per word boundary; None means no silence is inserted.
    sample_sil_probs = None
    if sil_prob > 0 and len(words) > 1:
        sample_sil_probs = np.random.random(len(words) - 1)

    for i, w in enumerate(words):
        if w in wrd_to_phn:
            phones.extend(wrd_to_phn[w])
        else:
            # OOV word: fall back to G2P, with trailing digits stripped.
            phones.extend(normalize_phn(g2p(w)))
        # The last word has no following boundary, hence the index check.
        if (
            sample_sil_probs is not None
            and i < len(sample_sil_probs)
            and sample_sil_probs[i] < sil_prob
        ):
            phones.append(sil)

    if surround:
        phones.append(sil)
    return phones


def main():
    """Convert every line of ``--input`` to phones and write it to ``--output``.

    Each input line is upper-cased and split on whitespace. Words are looked
    up in the lexicon; a line containing out-of-vocabulary words is skipped
    unless ``--strict`` is set, in which case those words are phonemized with
    G2P. ``<SIL>`` is inserted between words with probability ``--sil-prob``
    and around the whole line when ``--surround`` is set. One space-joined
    phone sequence is written per kept line.

    Raises:
        ValueError: if the lexicon contains a malformed or duplicate entry.
    """
    parser = get_parser()
    args = parser.parse_args()

    sil = "<SIL>"
    wrd_to_phn = _load_lexicon(args.lexicon)

    # G2P is only reachable in --strict mode (otherwise lines with OOV words
    # are skipped before conversion), so avoid building the model needlessly.
    g2p = G2p() if args.strict else None

    with open(args.input, "r", encoding="utf-8") as fin, open(
        args.output, "w", encoding="utf-8"
    ) as fout:
        for line in tqdm(fin):
            words = line.strip().upper().split()

            if not args.strict and any(w not in wrd_to_phn for w in words):
                continue

            phones = _words_to_phones(
                words, wrd_to_phn, g2p, sil, args.sil_prob, args.surround
            )
            print(" ".join(phones), file=fout)
# Run the conversion only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()