speecht5-tts-01 / handler.py

Update handler.py

0db7ad9 about 1 year ago

4.29 kB

	from typing import Dict, Any,Union
	import tempfile
	import numpy as np
	import torch
	import pyewts
	import noisereduce as nr
	from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
	from num2tib.core import convert
	from num2tib.core import convert2text
	import soundfile as sf
	import base64
	import re
	import requests
	import os
	from pydub import AudioSegment
	def increase_volume_without_distortion(audio_data, sample_rate, target_dBFS):
	    """Normalise mono audio to ``target_dBFS`` and return it as int16 samples.

	    Args:
	        audio_data: 1-D array of audio samples. Floating-point input is
	            treated as a waveform in [-1.0, 1.0] and converted to 16-bit PCM
	            before being handed to pydub, because pydub interprets the raw
	            bytes as signed integers of ``sample_width`` bytes. Integer
	            input is passed through as-is, with the dtype's item size used
	            as the sample width.
	        sample_rate: Sampling rate of ``audio_data`` in Hz.
	        target_dBFS: Loudness to normalise to, as measured by pydub's
	            ``AudioSegment.dBFS``.

	    Returns:
	        A NumPy ``int16`` array holding the gain-adjusted samples. Empty or
	        completely silent input is returned unchanged (cast to ``int16``),
	        since no finite gain can bring it to the target level.
	    """
	    audio_data = np.asarray(audio_data)

	    if np.issubdtype(audio_data.dtype, np.floating):
	        # Feeding float bytes to pydub would reinterpret the IEEE bit
	        # patterns as integer PCM; scale to real 16-bit PCM instead.
	        bounded = np.clip(np.nan_to_num(audio_data), -1.0, 1.0)
	        audio_data = (bounded * 32767).astype(np.int16)

	    if audio_data.size == 0:
	        return audio_data.astype(np.int16)

	    # Create an AudioSegment from the raw PCM bytes (single channel).
	    audio_segment = AudioSegment(
	        audio_data.tobytes(),
	        frame_rate=sample_rate,
	        sample_width=audio_data.dtype.itemsize,
	        channels=1,
	    )

	    current_dBFS = audio_segment.dBFS
	    if not np.isfinite(current_dBFS):
	        # Pure silence reports -inf dBFS; the gain needed would be infinite.
	        return audio_data.astype(np.int16)

	    # Shift the level by the difference between target and measured loudness.
	    normalized_audio = audio_segment.apply_gain(target_dBFS - current_dBFS)

	    # Convert the AudioSegment back to a NumPy array.
	    return np.array(normalized_audio.get_array_of_samples()).astype(np.int16)
	# Module-level pyewts converter, created once at import; EndpointHandler.__call__
	# calls its toWylie() method on the incoming text.
	converter = pyewts.pyewts()
	def download_file(url, destination, timeout=60):
	    """Download ``url`` and write the response body to ``destination``.

	    The whole body is fetched before the destination is opened, so a failed
	    request never creates or truncates the target file.

	    Args:
	        url: Address of the resource to fetch.
	        destination: Path of the file to (over)write with the response body.
	        timeout: Seconds to wait for the server before giving up; passed
	            straight to ``requests.get``.

	    Raises:
	        requests.HTTPError: If the server answers with a 4xx/5xx status.
	        requests.RequestException: On connection problems or timeouts.
	    """
	    response = requests.get(url, timeout=timeout)
	    # Without this check an HTML error page would be saved as if it were the
	    # requested file and only fail later, when someone tries to parse it.
	    response.raise_for_status()
	    with open(destination, 'wb') as file:
	        file.write(response.content)

	# Runs at import time: fetch the speaker-embedding file into the working
	# directory. EndpointHandler.__call__ later loads it from this relative path
	# through the `speaker_embeddings` mapping, so this call is required, not an example.
	download_file('https://huggingface.co/openpecha/speecht5-tts-01/resolve/main/female_2.npy', 'female_2.npy')
	def replace_numbers_with_convert(sentence, wylie=True):
	    """Return ``sentence`` with every numeral replaced by ``convert``'s output.

	    A numeral is a run of digits, optionally followed by a dot and more
	    digits. Each match is passed to ``convert`` together with the ``wylie``
	    flag, and whatever ``convert`` returns is substituted for the match.
	    Text without digits is returned unchanged.
	    """
	    return re.sub(
	        r'\d+(\.\d+)?',
	        lambda number: convert(number.group(), wylie),
	        sentence,
	    )

	def cleanup_text(inputs):
	    """Apply each (source, target) pair of ``replacements`` to ``inputs``.

	    The pairs are applied one after another in list order, so a later pair
	    sees the text already rewritten by the earlier ones.
	    """
	    cleaned = inputs
	    for old, new in replacements:
	        cleaned = cleaned.replace(old, new)
	    return cleaned

	# Speaker label -> path of the .npy file holding that speaker's embedding;
	# the handler loads the file with np.load.
	speaker_embeddings = {"Lhasa(female)": "female_2.npy"}

	# Ordered (source, target) substitutions that cleanup_text applies to the
	# transliterated text, one after another.
	replacements = [
	    ('_', '_'),  # identity pair: underscores are left as they are
	    ('*', 'v'),
	    ('`', ';'),
	    ('~', ','),
	    ('+', ','),
	    ('\\', ';'),
	    # Backslash followed by a pipe. Written with an explicit '\\' because
	    # '\|' is an invalid escape sequence (a SyntaxWarning on Python 3.12+);
	    # the runtime value is unchanged.
	    # NOTE(review): the previous pair already replaces every backslash, so
	    # this pair can never match. A bare '|' may have been intended - confirm.
	    ('\\|', ';'),
	    ('╚', ''),
	    ('╗', ''),
	]

	class EndpointHandler():
	    """Callable handler that turns input text into base64-encoded WAV speech.

	    The SpeechT5 processor, acoustic model and HiFi-GAN vocoder are loaded
	    once in ``__init__`` and reused for every call. Models run on CUDA when
	    it is available and fall back to the CPU otherwise.
	    """

	    _SAMPLE_RATE = 16000
	    # Loudness the generated speech is normalised to before encoding.
	    _TARGET_DBFS = -20.0

	    def __init__(self, path=""):
	        """Load the models and the speaker embedding.

	        Args:
	            path: Accepted for interface compatibility; the checkpoints are
	                fetched by their hub identifiers, so it is not used.
	        """
	        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

	        # load the model
	        self.processor = SpeechT5Processor.from_pretrained("TenzinGayche/TTS_run3_ep20_174k_b")
	        self.model = SpeechT5ForTextToSpeech.from_pretrained("TenzinGayche/TTS_run3_ep20_174k_b")
	        self.model.to(self.device)
	        self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
	        # Move the vocoder once here instead of on every request.
	        self.vocoder.to(self.device)

	        # The embedding never changes between requests, so read it once.
	        speaker_embedding = np.load(speaker_embeddings['Lhasa(female)'])
	        self.speaker_embedding = torch.tensor(speaker_embedding).to(self.device)

	    def __call__(self, data: Dict[str, Any]) -> Dict[str, Union[int, str]]:
	        """Synthesise speech for the text found under ``data["inputs"]``.

	        Args:
	            data (Dict[str, Any]): Request payload. The ``"inputs"`` entry is
	                removed from it and used as the text to speak.

	        Returns:
	            Dict[str, Union[int, str]]: ``"sample_rate"`` (always 16000) and
	            ``"audio_base64"``, a base64-encoded 24-bit PCM WAV file. Blank
	            input yields a WAV file containing no samples, so the response
	            has the same shape on every path.
	        """
	        text = data.pop("inputs", data)

	        # Nothing to say: answer with an empty recording instead of running
	        # the model.
	        if not text.strip():
	            return self._encode_wav(np.zeros(0, dtype=np.int16))

	        # process input
	        text = converter.toWylie(text)
	        text = cleanup_text(text)
	        text = replace_numbers_with_convert(text)

	        inputs = self.processor(text=text, return_tensors="pt")
	        # Truncate to the longest sequence the model accepts.
	        input_ids = inputs["input_ids"][..., :self.model.config.max_text_positions]

	        speech = self.model.generate_speech(
	            input_ids.to(self.device),
	            self.speaker_embedding,
	            vocoder=self.vocoder,
	        )
	        speech = nr.reduce_noise(y=speech.detach().cpu().numpy(), sr=self._SAMPLE_RATE)
	        if isinstance(speech, torch.Tensor):
	            speech = speech.detach().cpu().numpy()
	        speech = np.asarray(speech)

	        if np.issubdtype(speech.dtype, np.floating):
	            # The loudness helper hands raw bytes to pydub, which reads them
	            # as integer PCM; convert the float waveform to 16-bit PCM first.
	            speech = (np.clip(speech, -1.0, 1.0) * 32767).astype(np.int16)

	        # Increase volume without distortion
	        speech = increase_volume_without_distortion(speech, self._SAMPLE_RATE, self._TARGET_DBFS)

	        return self._encode_wav(speech)

	    def _encode_wav(self, samples):
	        """Write ``samples`` to a temporary WAV file and return the response dict."""
	        # Create a unique temporary WAV file; close it before writing so the
	        # path can be reopened by soundfile.
	        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_wav_file:
	            temp_wav_path = temp_wav_file.name

	        try:
	            sf.write(temp_wav_path, samples, self._SAMPLE_RATE, 'PCM_24')

	            # Read the WAV file and encode it as base64
	            with open(temp_wav_path, "rb") as wav_file:
	                audio_base64 = base64.b64encode(wav_file.read()).decode("utf-8")
	        finally:
	            # Clean up the temporary WAV file even if writing or reading failed.
	            os.remove(temp_wav_path)

	        return {
	            "sample_rate": self._SAMPLE_RATE,
	            "audio_base64": audio_base64,  # Base64-encoded audio data
	        }