
How to load the model and run inference

Download all the files in this repository to a local directory, model_dir.
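
Alternatively, the files can be fetched programmatically with huggingface_hub (a minimal sketch; the repo id below is a placeholder, not this model's actual id):

from huggingface_hub import snapshot_download

# Placeholder repo id: substitute this model's actual repo id
model_dir = snapshot_download("your-username/your-model", local_dir="model_dir")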

Load the model and the tokenizers

from transformers import BartConfig, BartTokenizer, PreTrainedTokenizerFast
from model_s2p import ConfigurableBART  # defined in model_s2p.py, included among the downloaded files

model_dir = "model_dir"  # path to the local directory holding the downloaded files

config_s2p = BartConfig.from_pretrained(model_dir)  # optional: inspect the model configuration
model_s2p = ConfigurableBART.from_pretrained(model_dir)

input_tokenizer_s2p = BartTokenizer.from_pretrained(model_dir + "/input_tokenizer")
output_tokenizer_s2p = PreTrainedTokenizerFast.from_pretrained(model_dir + "/output_tokenizer")
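
As a quick optional sanity check, you can confirm the input tokenizer encodes a sample sentence (the shape will vary with the text):

sample = input_tokenizer_s2p("Hello world", return_tensors="pt")
print(sample["input_ids"].shape)  # a (1, sequence_length) tensor of token ids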

Inference

# Add custom decoding logic
import re

def remove_intra_word_spaces(text):
    # Strip BART's special tokens before cleaning up spacing
    text = text.replace("<s>", "").replace("</s>", "").strip()

    # Step 1: Split on 2+ spaces (which indicate word boundaries)
    words = re.split(r'\s{2,}', text)

    # Step 2: For each word, remove all single spaces (intra-word spacing)
    cleaned_words = [''.join(word.split()) for word in words]

    # Step 3: Join words back with a single space
    return ' '.join(cleaned_words)

# Custom decoding wrapper
def custom_decode(token_ids, tokenizer, **kwargs):
    decoded = tokenizer.decode(token_ids, **kwargs)
    return remove_intra_word_spaces(decoded)
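
The cleanup assumes the raw decoding uses single spaces within a word and two or more spaces between words (see the comments above). A quick illustration on a hypothetical raw decoding:

print(remove_intra_word_spaces("<s>k ˈæ t s  æ n d  d ˈɔ ɡ z</s>"))
# -> 'kˈæts ænd dˈɔɡz'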

Set the seed and create an inference function (one sentence per call)

import random
import numpy as np
import torch

def set_seed(seed=42):
    # Seed every RNG source so generation is reproducible
    torch.manual_seed(seed)
    random.seed(seed)
    np.random.seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
        # Force deterministic cuDNN kernels (may cost some throughput)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

def model_inference(t, model, encoder_len, decoder_len, num_beams, input_tokenizer, output_tokenizer):
    set_seed(42)
    model.eval()
    inputs = input_tokenizer(t, return_tensors="pt", padding=True, truncation=True, max_length=encoder_len)
    with torch.no_grad():
        output_ids = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=decoder_len,
            num_beams=num_beams,  # you can adjust the decoding strategy here
            early_stopping=True,
        )
    output_phonemes = custom_decode(output_ids[0], output_tokenizer, skip_special_tokens=False)
    return output_phonemes

Calling the inference function:

text = "It's raining cats and dogs."
# encoder_len=128, decoder_len=512, num_beams=5
result = model_inference(text, model_s2p, 128, 512, 5, input_tokenizer_s2p, output_tokenizer_s2p)

This should return:

'ˌɪts ɹˈAnɪŋ kˈæts ænd dˈɔɡz.'
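
Since the function handles one sentence per call, longer text can be phonemized by splitting it into sentences and looping (a minimal sketch reusing the objects defined above):

sentences = ["It's raining cats and dogs.", "The quick brown fox jumps over the lazy dog."]
phonemes = [
    model_inference(s, model_s2p, 128, 512, 5, input_tokenizer_s2p, output_tokenizer_s2p)
    for s in sentences
]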