|
import opuslib |
|
import asyncio |
|
from edge_tts import Communicate |
|
import soundfile as sf |
|
import io |
|
from pydub import AudioSegment |
|
import numpy as np |
|
|
|
|
|
class TtsUtility: |
|
def __init__(self, audio_config): |
|
self.audio_config = audio_config |
|
|
|
async def generate_tts(self, text: str) -> bytes: |
|
"""使用 Edge TTS 生成语音""" |
|
communicate = Communicate(text, "zh-CN-XiaoxiaoNeural") |
|
audio_data = b"" |
|
async for chunk in communicate.stream(): |
|
if chunk["type"] == "audio": |
|
audio_data += chunk["data"] |
|
return audio_data |
|
|
|
async def text_to_opus_audio(self, text: str) -> list: |
|
"""将文本转换为 Opus 音频""" |
|
|
|
|
|
audio_data = await self.generate_tts(text) |
|
|
|
try: |
|
|
|
audio = AudioSegment.from_mp3(io.BytesIO(audio_data)) |
|
|
|
|
|
audio = audio.set_frame_rate(self.audio_config.INPUT_SAMPLE_RATE) |
|
audio = audio.set_channels(self.audio_config.CHANNELS) |
|
|
|
wav_data = io.BytesIO() |
|
audio.export(wav_data, format='wav') |
|
wav_data.seek(0) |
|
|
|
|
|
data, samplerate = sf.read(wav_data) |
|
|
|
|
|
if data.dtype != np.int16: |
|
data = (data * 32767).astype(np.int16) |
|
|
|
|
|
raw_data = data.tobytes() |
|
|
|
except Exception as e: |
|
print(f"[ERROR] 音频转换失败: {e}") |
|
return None |
|
|
|
|
|
encoder = opuslib.Encoder( |
|
self.audio_config.INPUT_SAMPLE_RATE, |
|
self.audio_config.CHANNELS, |
|
opuslib.APPLICATION_VOIP |
|
) |
|
|
|
|
|
frame_size = self.audio_config.INPUT_FRAME_SIZE |
|
opus_frames = [] |
|
|
|
|
|
for i in range(0, len(raw_data), frame_size * 2): |
|
chunk = raw_data[i:i + frame_size * 2] |
|
if len(chunk) < frame_size * 2: |
|
|
|
chunk += b'\x00' * (frame_size * 2 - len(chunk)) |
|
opus_frame = encoder.encode(chunk, frame_size) |
|
opus_frames.append(opus_frame) |
|
|
|
return opus_frames |