import base64
import io
import os
import re
import tempfile
from typing import Dict, Any, Union

import noisereduce as nr
import numpy as np
import pyewts
import requests
import soundfile as sf
import torch
from num2tib.core import convert
from num2tib.core import convert2text
from pydub import AudioSegment
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan

# Sample rate (Hz) used for noise reduction, normalisation and the WAV output.
SAMPLE_RATE = 16000

# Loudness target handed to the normaliser; adjust according to requirement.
TARGET_DBFS = -20.0

# Matches an integer or a decimal number, e.g. "12" or "3.14".
_NUMBER_PATTERN = re.compile(r'\d+(\.\d+)?')


def increase_volume_without_distortion(audio_data, sample_rate, target_dBFS):
    """Apply a gain so the audio's average level reaches ``target_dBFS``.

    Args:
        audio_data: Mono samples as a numpy array. Floating-point input is
            converted to 16-bit PCM first (assumes the usual [-1.0, 1.0]
            range for float audio; values outside it are clipped).
        sample_rate: Frame rate of ``audio_data`` in Hz.
        target_dBFS: Desired level in dBFS as measured by pydub.

    Returns:
        The gain-adjusted samples as an ``int16`` numpy array.
    """
    samples = np.asarray(audio_data)
    if np.issubdtype(samples.dtype, np.floating):
        # pydub interprets the byte string as integer PCM of the given
        # sample width; handing it raw float bytes yields garbage samples.
        samples = (np.clip(samples, -1.0, 1.0) * 32767.0).astype(np.int16)
    samples = np.ascontiguousarray(samples)

    # Create an AudioSegment from raw audio data
    audio_segment = AudioSegment(
        samples.tobytes(),
        frame_rate=sample_rate,
        sample_width=samples.dtype.itemsize,
        channels=1,  # or 2 for stereo
    )

    current_dBFS = audio_segment.dBFS
    if current_dBFS == float("-inf"):
        # Silent (or empty) audio has no finite level; the required gain
        # would be infinite, so return the samples untouched.
        return samples.astype(np.int16)

    # Normalize the audio level
    normalized_audio = audio_segment.apply_gain(target_dBFS - current_dBFS)

    # Convert the AudioSegment back to a numpy array
    return np.array(normalized_audio.get_array_of_samples()).astype(np.int16)


converter = pyewts.pyewts()


def download_file(url, destination, timeout=60):
    """Download ``url`` and write the response body to ``destination``.

    Args:
        url: Address of the file to fetch.
        destination: Local path the bytes are written to.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with an error status, so
            an error page is never saved in place of the expected file.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    with open(destination, 'wb') as file:
        file.write(response.content)


# Fetch the speaker embedding referenced by ``speaker_embeddings`` below.
download_file('https://huggingface.co/openpecha/speecht5-tts-01/resolve/main/female_2.npy', 'female_2.npy')


def replace_numbers_with_convert(sentence, wylie=True):
    """Replace every number in ``sentence`` with ``convert(number, wylie)``.

    Args:
        sentence: Text that may contain integers or decimals.
        wylie: Forwarded unchanged as the second argument of num2tib's
            ``convert``.

    Returns:
        The sentence with each numeric match substituted.
    """
    def replace(match):
        return convert(match.group(), wylie)

    return _NUMBER_PATTERN.sub(replace, sentence)


speaker_embeddings = {
    "Lhasa(female)": "female_2.npy",
}

# (source, destination) pairs applied in order by ``cleanup_text``.
replacements = [
    ('_', '_'),
    ('*', 'v'),
    ('`', ';'),
    ('~', ','),
    ('+', ','),
    ('\\', ';'),
    ('|', ';'),
    ('╚', ''),
    ('╗', ''),
]


def cleanup_text(inputs):
    """Apply every substitution in ``replacements`` to ``inputs``, in order."""
    for src, dst in replacements:
        inputs = inputs.replace(src, dst)
    return inputs


def _encode_wav_base64(samples, sample_rate):
    """Serialise ``samples`` as a 24-bit PCM WAV and return it base64-encoded."""
    # An in-memory buffer avoids leaving a temporary file behind when
    # writing or reading fails part-way through.
    buffer = io.BytesIO()
    sf.write(buffer, samples, sample_rate, subtype='PCM_24', format='WAV')
    return base64.b64encode(buffer.getvalue()).decode("utf-8")


class EndpointHandler():
    """Text-to-speech request handler built on a SpeechT5 model."""

    def __init__(self, path=""):
        """Load the processor, model, vocoder and speaker embedding.

        Args:
            path: Accepted for interface compatibility; not used here.
        """
        # Prefer the GPU but stay usable on CPU-only hosts.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # load the model
        self.processor = SpeechT5Processor.from_pretrained("TenzinGayche/TTS_run3_ep20_174k_b")
        self.model = SpeechT5ForTextToSpeech.from_pretrained("TenzinGayche/TTS_run3_ep20_174k_b")
        self.model.to(self.device)
        self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
        self.vocoder.to(self.device)

        # Loaded once here rather than from disk on every request.
        embedding = np.load(speaker_embeddings['Lhasa(female)'])
        self.speaker_embedding = torch.tensor(embedding).to(self.device)

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Union[int, str]]:
        """Synthesise speech for the text found under ``data["inputs"]``.

        Args:
            data: Request payload. The ``"inputs"`` entry is removed from it
                and must be a string.

        Returns:
            A dict with ``"sample_rate"`` (int) and ``"audio_base64"`` (a
            base64-encoded WAV file). Blank input yields a WAV with no
            samples.

        Raises:
            ValueError: If the payload does not provide a string input.
        """
        text = data.pop("inputs", data)
        if not isinstance(text, str):
            raise ValueError('Request payload must contain a string under "inputs".')

        # process input
        if not text.strip():
            # Keep the response shape identical to the normal path so the
            # result stays serialisable and clients need no special case.
            return {
                "sample_rate": SAMPLE_RATE,
                "audio_base64": _encode_wav_base64(np.zeros(0, dtype=np.int16), SAMPLE_RATE),
            }

        text = converter.toWylie(text)
        text = cleanup_text(text)
        text = replace_numbers_with_convert(text)

        inputs = self.processor(text=text, return_tensors="pt")
        # Truncate to the maximum number of text positions the model accepts.
        input_ids = inputs["input_ids"][..., :self.model.config.max_text_positions]

        speech = self.model.generate_speech(
            input_ids.to(self.device),
            self.speaker_embedding,
            vocoder=self.vocoder,
        )
        speech = nr.reduce_noise(y=speech.cpu().numpy(), sr=SAMPLE_RATE)
        if isinstance(speech, torch.Tensor):
            speech = speech.numpy()

        # Increase volume without distortion
        speech = increase_volume_without_distortion(speech, SAMPLE_RATE, TARGET_DBFS)

        return {
            "sample_rate": SAMPLE_RATE,
            "audio_base64": _encode_wav_base64(speech, SAMPLE_RATE),  # Base64-encoded audio data
        }