from typing import Dict, Any, Union
import base64
import os
import re
import tempfile

import numpy as np
import torch
import pyewts
import noisereduce as nr
import requests
import soundfile as sf
from pydub import AudioSegment
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from num2tib.core import convert


def increase_volume_without_distortion(audio_data, sample_rate, target_dBFS):
    """Normalize mono audio to a target loudness (in dBFS) without clipping."""
    # pydub expects integer PCM; convert float waveforms (e.g. model output
    # in [-1.0, 1.0]) to 16-bit samples first.
    if np.issubdtype(audio_data.dtype, np.floating):
        audio_data = (np.clip(audio_data, -1.0, 1.0) * 32767).astype(np.int16)

    # Wrap the raw samples in an AudioSegment so pydub can measure loudness.
    audio_segment = AudioSegment(
        audio_data.tobytes(),
        frame_rate=sample_rate,
        sample_width=audio_data.dtype.itemsize,
        channels=1,  # mono; use 2 for stereo
    )

    # Apply a uniform gain that brings the segment to the target level.
    change_in_dBFS = target_dBFS - audio_segment.dBFS
    normalized_audio = audio_segment.apply_gain(change_in_dBFS)

    # Convert the AudioSegment back to a numpy array.
    return np.array(normalized_audio.get_array_of_samples()).astype(np.int16)
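
# Example (a sketch, kept as a comment so importing this module stays free of
# side effects): normalize a synthetic 440 Hz tone to -20 dBFS. The helper
# accepts float waveforms in [-1.0, 1.0] and converts them to 16-bit PCM
# internally before measuring loudness.
#
#   tone = 0.1 * np.sin(2 * np.pi * 440 * np.arange(16000) / 16000)
#   louder = increase_volume_without_distortion(tone, 16000, target_dBFS=-20.0)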

# Converter between Tibetan Unicode text and Wylie transliteration.
converter = pyewts.pyewts()

def download_file(url, destination):
    """Download a file over HTTP and save it to `destination`."""
    response = requests.get(url)
    response.raise_for_status()
    with open(destination, 'wb') as file:
        file.write(response.content)

# Fetch the speaker embedding once at import time; skip if it is already cached.
if not os.path.exists('female_2.npy'):
    download_file('https://huggingface.co/openpecha/speecht5-tts-01/resolve/main/female_2.npy', 'female_2.npy')

def replace_numbers_with_convert(sentence, wylie=True):
    """Replace every Arabic numeral in `sentence` with its Tibetan spelling."""
    pattern = r'\d+(\.\d+)?'

    def replace(match):
        return convert(match.group(), wylie)

    return re.sub(pattern, replace, sentence)
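
# Example (sketch): numerals are spelled out before tokenization, e.g.
#
#   replace_numbers_with_convert("lo 2024")
#
# returns the sentence with "2024" replaced by its spelling from num2tib.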

def cleanup_text(inputs):
    """Apply the character substitutions defined in `replacements` below."""
    for src, dst in replacements:
        inputs = inputs.replace(src, dst)
    return inputs

speaker_embeddings = {
    "Lhasa(female)": "female_2.npy",
}

# (source, replacement) pairs applied to the Wylie text before tokenization.
replacements = [
    ('_', '_'),
    ('*', 'v'),
    ('`', ';'),
    ('~', ','),
    ('+', ','),
    ('\\', ';'),
    ('|', ';'),
    ('╚', ''),
    ('╗', ''),
]

class EndpointHandler:
    def __init__(self, path=""):
        # Load the processor, acoustic model, and vocoder once at startup.
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.processor = SpeechT5Processor.from_pretrained("TenzinGayche/TTS_run3_ep20_174k_b")
        self.model = SpeechT5ForTextToSpeech.from_pretrained("TenzinGayche/TTS_run3_ep20_174k_b")
        self.model.to(self.device)
        self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
        self.vocoder.to(self.device)

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Union[int, str]]:
        """Synthesize speech for a Tibetan text request.

        Args:
            data (Dict[str, Any]): request payload; the text to synthesize is
                read from the "inputs" key.

        Returns:
            Dict[str, Union[int, str]]: the sample rate and the synthesized
            audio as a base64-encoded WAV file.
        """
        text = data.pop("inputs", data)

        # Short-circuit on empty input.
        if len(text.strip()) == 0:
            return {"sample_rate": 16000, "audio_base64": ""}

        # Normalize the text: Tibetan Unicode -> Wylie, punctuation cleanup,
        # and numerals spelled out.
        text = converter.toWylie(text)
        text = cleanup_text(text)
        text = replace_numbers_with_convert(text)

        # Tokenize and truncate to the model's maximum input length.
        inputs = self.processor(text=text, return_tensors="pt")
        input_ids = inputs["input_ids"][..., :self.model.config.max_text_positions]

        # Load the speaker x-vector and synthesize the waveform.
        speaker_embedding = torch.tensor(np.load(speaker_embeddings['Lhasa(female)']))
        speech = self.model.generate_speech(
            input_ids.to(self.device),
            speaker_embedding.to(self.device),
            vocoder=self.vocoder,
        )

        # Denoise on the CPU; noisereduce operates on numpy arrays.
        speech = nr.reduce_noise(y=speech.cpu().numpy(), sr=16000)

        # Increase volume without distortion.
        target_dBFS = -20.0  # target loudness; adjust to taste
        speech = increase_volume_without_distortion(speech, 16000, target_dBFS)

        # Write the samples to a unique temporary WAV file.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_wav_file:
            temp_wav_path = temp_wav_file.name
            sf.write(temp_wav_path, speech, 16000, 'PCM_16')  # 16-bit PCM matches the int16 samples

        # Read the WAV file and encode it as base64
        with open(temp_wav_path, "rb") as wav_file:
            audio_base64 = base64.b64encode(wav_file.read()).decode("utf-8")

        # Clean up the temporary WAV file
        os.remove(temp_wav_path)

        return {
            "sample_rate": 16000,
            "audio_base64": audio_base64,  # Base64-encoded audio data
        }
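

if __name__ == "__main__":
    # Local smoke test (a sketch, not part of the endpoint contract): run one
    # request through the handler and write the decoded audio to disk. The
    # sample sentence is an arbitrary Tibetan greeting chosen for the demo.
    handler = EndpointHandler()
    result = handler({"inputs": "བཀྲ་ཤིས་བདེ་ལེགས།"})
    with open("sample.wav", "wb") as f:
        f.write(base64.b64decode(result["audio_base64"]))
    print(f"wrote sample.wav at {result['sample_rate']} Hz")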