File size: 6,799 Bytes
78cb487 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 |
import torch
import numpy as np
from pathlib import Path
from src.models.models import build_model
from src.core.kokoro import generate
from .voice import split_into_sentences
class VoiceGenerator:
    """
    Manage speech generation with a pre-trained model and a voice pack.

    Construct the object with the model and voice directories, call
    ``initialize()`` to load a model and a voice pack, then call
    ``generate()`` to turn text into audio.
    """

    def __init__(self, models_dir, voices_dir):
        """
        Initializes the VoiceGenerator with model and voice directories.

        Nothing is loaded here; call ``initialize()`` before ``generate()``.

        Args:
            models_dir (Path or str): Directory containing model files.
            voices_dir (Path or str): Directory containing voice pack files.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = None
        self.voicepack = None
        self.voice_name = None
        # Coerce to Path so plain string arguments work with the "/" joins
        # used when resolving model and voice files.
        self.models_dir = Path(models_dir)
        self.voices_dir = Path(voices_dir)
        self._initialized = False

    def initialize(self, model_path, voice_name):
        """
        Initializes the model and voice pack for audio generation.

        Both files are checked for existence before anything is loaded, and
        the instance attributes are only updated once both loads succeeded.
        A failed call therefore leaves a previously loaded model/voice pair
        untouched instead of mixing a new model with an old voice pack.

        Args:
            model_path (str): The filename of the model, relative to
                ``models_dir``.
            voice_name (str): The name of the voice pack (file stem of a
                ``.pt`` file inside ``voices_dir``).

        Returns:
            str: A message indicating the voice has been loaded.

        Raises:
            FileNotFoundError: If the model or voice pack file is not found.
        """
        model_file = self.models_dir / model_path
        if not model_file.exists():
            raise FileNotFoundError(
                f"Model file not found at {model_file}. Please place the model file in the 'models' directory."
            )

        voice_path = self.voices_dir / f"{voice_name}.pt"
        if not voice_path.exists():
            raise FileNotFoundError(
                f"Voice pack not found at {voice_path}. Please place voice files in the 'data/voices' directory."
            )

        # Load into locals first so a failure below cannot leave the
        # instance half-updated.
        model = build_model(str(model_file), self.device)
        # map_location lets a voice pack that was saved from a CUDA tensor
        # be loaded on a CPU-only machine.
        voicepack = torch.load(
            voice_path, map_location=self.device, weights_only=True
        ).to(self.device)

        self.model = model
        self.voicepack = voicepack
        self.voice_name = voice_name
        self._initialized = True
        return f"Loaded voice: {voice_name}"

    def list_available_voices(self):
        """
        Lists all available voice packs in the voices directory.

        Returns:
            list: Voice pack names (without the .pt extension), sorted
            alphabetically so the order is stable across platforms. Empty if
            the voices directory does not exist.
        """
        if not self.voices_dir.is_dir():
            return []
        return sorted(voice_file.stem for voice_file in self.voices_dir.glob("*.pt"))

    def is_initialized(self):
        """
        Checks if the generator is properly initialized.

        Returns:
            bool: True if the model and voice pack are loaded, False otherwise.
        """
        return (
            self._initialized and self.model is not None and self.voicepack is not None
        )

    def generate(
        self,
        text,
        lang=None,
        speed=1.0,
        pause_duration=4000,
        short_text_limit=200,
        return_chunks=False,
    ):
        """
        Generates speech from the given text.

        Text shorter than ``short_text_limit`` characters is synthesized in
        one call; longer text is split into sentences that are synthesized
        one by one.

        Args:
            text (str): The text to generate speech from.
            lang (str, optional): The language of the text. Defaults to None,
                in which case the first character of the loaded voice name
                is used.
            speed (float, optional): The speed of speech generation.
                Defaults to 1.0.
            pause_duration (int, optional): Length of the silence inserted
                between sentences, expressed as a number of zero-valued
                audio samples (not milliseconds; the resulting duration
                depends on the sample rate of the generated audio). Ignored
                when ``return_chunks`` is True. Defaults to 4000.
            short_text_limit (int, optional): The character limit for
                considering text as short. Defaults to 200.
            return_chunks (bool, optional): If True, returns a list of audio
                chunks instead of concatenated audio. Defaults to False.

        Returns:
            tuple: ``(audio, phonemes)``. ``audio`` is a single array, or a
            list of arrays when ``return_chunks`` is True. For empty input
            the result is ``(None, [])`` or ``([], [])`` respectively.

        Raises:
            RuntimeError: If the model is not initialized.
            ValueError: If there is an error during audio generation. The
                underlying exception is chained as ``__cause__``.
        """
        if not self.is_initialized():
            raise RuntimeError("Model not initialized. Call initialize() first.")

        if lang is None:
            lang = self.voice_name[0]

        text = text.strip()
        if not text:
            return ([], []) if return_chunks else (None, [])

        try:
            if len(text) < short_text_limit:
                return self._generate_short(text, lang, speed, return_chunks)
            return self._generate_long(
                text, lang, speed, pause_duration, return_chunks
            )
        except Exception as e:
            # Callers rely on ValueError for any generation failure; chain the
            # cause so the original traceback is not lost.
            raise ValueError(f"Error in audio generation: {str(e)}") from e

    def _generate_short(self, text, lang, speed, return_chunks):
        """Synthesize a short text in a single model call."""
        try:
            audio, phonemes = generate(
                self.model, text, self.voicepack, lang=lang, speed=speed
            )
            if audio is None or len(audio) == 0:
                raise ValueError(f"Failed to generate audio for text: {text}")
        except Exception as e:
            raise ValueError(
                f"Error generating audio for text: {text}. Error: {str(e)}"
            ) from e
        return ([audio], phonemes) if return_chunks else (audio, phonemes)

    def _generate_long(self, text, lang, speed, pause_duration, return_chunks):
        """Synthesize long text sentence by sentence and join the results."""
        sentences = split_into_sentences(text)
        if not sentences:
            return ([], []) if return_chunks else (None, [])

        audio_segments = []
        phonemes_list = []
        failed_sentences = []

        for i, sentence in enumerate(sentences):
            if not sentence.strip():
                continue
            try:
                audio, phonemes = generate(
                    self.model, sentence, self.voicepack, lang=lang, speed=speed
                )
                if audio is None or len(audio) == 0:
                    failed_sentences.append((i, sentence, "Generated audio is empty"))
                    continue
                # Insert the pause only between two successfully generated
                # segments, and only when a single joined array is requested.
                if audio_segments and not return_chunks and pause_duration > 0:
                    # Match the segment dtype so concatenation does not
                    # silently upcast the whole result to float64.
                    silence_dtype = (
                        audio.dtype if isinstance(audio, np.ndarray) else None
                    )
                    audio_segments.append(
                        np.zeros(pause_duration, dtype=silence_dtype)
                    )
                audio_segments.append(audio)
                phonemes_list.extend(phonemes)
            except Exception as e:
                # Keep going so that every failing sentence is reported at once.
                failed_sentences.append((i, sentence, str(e)))

        if failed_sentences:
            error_msg = "\n".join(
                f"Sentence {i+1}: '{s}' - {e}" for i, s, e in failed_sentences
            )
            raise ValueError(
                f"Failed to generate audio for some sentences:\n{error_msg}"
            )

        if not audio_segments:
            return ([], []) if return_chunks else (None, [])
        if return_chunks:
            return audio_segments, phonemes_list
        return np.concatenate(audio_segments), phonemes_list
|