import torch
import numpy as np
from pathlib import Path

from src.models.models import build_model
from src.core.kokoro import generate
from .voice import split_into_sentences

class VoiceGenerator:
    """
    A class to manage voice generation using a pre-trained model.
    """

    def __init__(self, models_dir, voices_dir):
        """
        Initializes the VoiceGenerator with model and voice directories.

        Args:
            models_dir (Path): Path to the directory containing model files.
            voices_dir (Path): Path to the directory containing voice pack files.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = None
        self.voicepack = None
        self.voice_name = None
        self.models_dir = models_dir
        self.voices_dir = voices_dir
        self._initialized = False

    def initialize(self, model_path, voice_name):
        """
        Initializes the model and voice pack for audio generation.

        Args:
            model_path (str): The filename of the model.
            voice_name (str): The name of the voice pack.

        Returns:
            str: A message indicating the voice has been loaded.

        Raises:
            FileNotFoundError: If the model or voice pack file is not found.
        """
        model_file = self.models_dir / model_path
        if not model_file.exists():
            raise FileNotFoundError(
                f"Model file not found at {model_file}. Please place the model file in the 'models' directory."
            )
        self.model = build_model(str(model_file), self.device)
        self.voice_name = voice_name

        voice_path = self.voices_dir / f"{voice_name}.pt"
        if not voice_path.exists():
            raise FileNotFoundError(
                f"Voice pack not found at {voice_path}. Please place voice files in the 'data/voices' directory."
            )
        self.voicepack = torch.load(voice_path, weights_only=True).to(self.device)
        self._initialized = True
        return f"Loaded voice: {voice_name}"

    def list_available_voices(self):
        """
        Lists all available voice packs in the voices directory.

        Returns:
            list: A list of voice pack names (without the .pt extension).
        """
        if not self.voices_dir.exists():
            return []
        return [f.stem for f in self.voices_dir.glob("*.pt")]

    def is_initialized(self):
        """
        Checks whether the generator is properly initialized.

        Returns:
            bool: True if the model and voice pack are loaded, False otherwise.
        """
        return (
            self._initialized and self.model is not None and self.voicepack is not None
        )

    def generate(
        self,
        text,
        lang=None,
        speed=1.0,
        pause_duration=4000,
        short_text_limit=200,
        return_chunks=False,
    ):
        """
        Generates speech from the given text.

        Handles both short and long-form text by splitting long text into sentences.

        Args:
            text (str): The text to generate speech from.
            lang (str, optional): The language of the text. Defaults to None, in which
                case the language is inferred from the first character of the voice name.
            speed (float, optional): The speed of speech generation. Defaults to 1.0.
            pause_duration (int, optional): The length of the silent gap inserted
                between sentences, in audio samples. Defaults to 4000.
            short_text_limit (int, optional): The character limit below which text is
                generated in a single pass. Defaults to 200.
            return_chunks (bool, optional): If True, returns a list of audio chunks
                instead of concatenated audio. Defaults to False.

        Returns:
            tuple: The generated audio (a numpy array, or a list of numpy arrays when
                return_chunks is True) and a list of phonemes.

        Raises:
            RuntimeError: If the model is not initialized.
            ValueError: If there is an error during audio generation.
        """
        if not self.is_initialized():
            raise RuntimeError("Model not initialized. Call initialize() first.")
        if lang is None:
            # Voice pack names encode the language in their first character,
            # so fall back to that when no language is given.
            lang = self.voice_name[0]
        text = text.strip()
        if not text:
            return ([], []) if return_chunks else (None, [])
        try:
            # Short text: generate in a single pass without sentence splitting.
            if len(text) < short_text_limit:
                try:
                    audio, phonemes = generate(
                        self.model, text, self.voicepack, lang=lang, speed=speed
                    )
                    if audio is None or len(audio) == 0:
                        raise ValueError(f"Failed to generate audio for text: {text}")
                    return ([audio], phonemes) if return_chunks else (audio, phonemes)
                except Exception as e:
                    raise ValueError(
                        f"Error generating audio for text: {text}. Error: {str(e)}"
                    )

            # Long-form text: split into sentences and generate each one separately.
            sentences = split_into_sentences(text)
            if not sentences:
                return ([], []) if return_chunks else (None, [])

            audio_segments = []
            phonemes_list = []
            failed_sentences = []
            for i, sentence in enumerate(sentences):
                if not sentence.strip():
                    continue
                try:
                    # Insert a short silence between sentences when concatenating.
                    if audio_segments and not return_chunks:
                        audio_segments.append(np.zeros(pause_duration))
                    audio, phonemes = generate(
                        self.model, sentence, self.voicepack, lang=lang, speed=speed
                    )
                    if audio is not None and len(audio) > 0:
                        audio_segments.append(audio)
                        phonemes_list.extend(phonemes)
                    else:
                        failed_sentences.append(
                            (i, sentence, "Generated audio is empty")
                        )
                except Exception as e:
                    failed_sentences.append((i, sentence, str(e)))
                    continue

            if failed_sentences:
                error_msg = "\n".join(
                    f"Sentence {i + 1}: '{s}' - {e}" for i, s, e in failed_sentences
                )
                raise ValueError(
                    f"Failed to generate audio for some sentences:\n{error_msg}"
                )
            if not audio_segments:
                return ([], []) if return_chunks else (None, [])
            if return_chunks:
                return audio_segments, phonemes_list
            return np.concatenate(audio_segments), phonemes_list
        except Exception as e:
            raise ValueError(f"Error in audio generation: {str(e)}")