import torch
import numpy as np
from pathlib import Path
from src.models.models import build_model
from src.core.kokoro import generate
from .voice import split_into_sentences


class VoiceGenerator:
    """
    A class to manage voice generation using a pre-trained model.
    """

    def __init__(self, models_dir, voices_dir):
        """
        Initializes the VoiceGenerator with model and voice directories.

        Args:
            models_dir (Path): Path to the directory containing model files.
            voices_dir (Path): Path to the directory containing voice pack files.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = None
        self.voicepack = None
        self.voice_name = None
        self.models_dir = models_dir
        self.voices_dir = voices_dir
        self._initialized = False

    def initialize(self, model_path, voice_name):
        """
        Initializes the model and voice pack for audio generation.

        Args:
            model_path (str): Filename of the model, resolved relative to models_dir.
            voice_name (str): The name of the voice pack.

        Returns:
            str: A message indicating the voice has been loaded.

        Raises:
            FileNotFoundError: If the model or voice pack file is not found.
        """
        model_file = self.models_dir / model_path
        if not model_file.exists():
            raise FileNotFoundError(
                f"Model file not found at {model_file}. Please place the model file in the 'models' directory."
            )

        self.model = build_model(str(model_file), self.device)
        self.voice_name = voice_name

        voice_path = self.voices_dir / f"{voice_name}.pt"
        if not voice_path.exists():
            raise FileNotFoundError(
                f"Voice pack not found at {voice_path}. Please place voice files in the 'data/voices' directory."
            )

        # weights_only=True restricts torch.load to plain tensor data, avoiding
        # arbitrary code execution from untrusted pickle payloads.
        self.voicepack = torch.load(voice_path, weights_only=True).to(self.device)
        self._initialized = True
        return f"Loaded voice: {voice_name}"

    def list_available_voices(self):
        """
        Lists all available voice packs in the voices directory.

        Returns:
            list: A list of voice pack names (without the .pt extension).
        """
        if not self.voices_dir.exists():
            return []
        return [f.stem for f in self.voices_dir.glob("*.pt")]

    def is_initialized(self):
        """
        Checks if the generator is properly initialized.

        Returns:
            bool: True if the model and voice pack are loaded, False otherwise.
        """
        return (
            self._initialized and self.model is not None and self.voicepack is not None
        )

    def generate(
        self,
        text,
        lang=None,
        speed=1.0,
        pause_duration=4000,
        short_text_limit=200,
        return_chunks=False,
    ):
        """
        Generates speech from the given text.

        Handles both short and long-form text by splitting long text into sentences.

        Args:
            text (str): The text to generate speech from.
            lang (str, optional): The language of the text. Defaults to None.
            speed (float, optional): The speed of speech generation. Defaults to 1.0.
            pause_duration (int, optional): The length of the silence inserted between sentences, in samples. Defaults to 4000.
            short_text_limit (int, optional): The character limit below which text is synthesized in a single pass rather than split into sentences. Defaults to 200.
            return_chunks (bool, optional): If True, returns a list of audio chunks instead of concatenated audio. Defaults to False.

        Returns:
            tuple: The generated audio (a numpy array, or a list of numpy arrays when return_chunks is True) and the corresponding phonemes. Empty input yields (None, []) (or ([], []) when return_chunks is True).

        Raises:
            RuntimeError: If the model is not initialized.
            ValueError: If there is an error during audio generation.
        """
        if not self.is_initialized():
            raise RuntimeError("Model not initialized. Call initialize() first.")

        if lang is None:
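            # Derive the language code from the first character of the voice
            # name (e.g. "af_..." -> "a"), following the voice pack naming
            # convention.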
            lang = self.voice_name[0]

        text = text.strip()
        if not text:
            return (None, []) if not return_chunks else ([], [])

        try:
            if len(text) < short_text_limit:
                try:
                    audio, phonemes = generate(
                        self.model, text, self.voicepack, lang=lang, speed=speed
                    )
                    if audio is None or len(audio) == 0:
                        raise ValueError(f"Failed to generate audio for text: {text}")
                    return (
                        (audio, phonemes) if not return_chunks else ([audio], phonemes)
                    )
                except Exception as e:
                    raise ValueError(
                        f"Error generating audio for text: {text}. Error: {str(e)}"
                    ) from e

            sentences = split_into_sentences(text)
            if not sentences:
                return (None, []) if not return_chunks else ([], [])

            audio_segments = []
            phonemes_list = []
            failed_sentences = []

            for i, sentence in enumerate(sentences):
                if not sentence.strip():
                    continue

                try:
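                    # Insert pause_duration zero-valued samples of silence before
                    # this sentence's audio when building a single concatenated
                    # track (skipped when returning individual chunks).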
                    if audio_segments and not return_chunks:
                        audio_segments.append(np.zeros(pause_duration))

                    audio, phonemes = generate(
                        self.model, sentence, self.voicepack, lang=lang, speed=speed
                    )
                    if audio is not None and len(audio) > 0:
                        audio_segments.append(audio)
                        phonemes_list.extend(phonemes)
                    else:
                        failed_sentences.append(
                            (i, sentence, "Generated audio is empty")
                        )
                except Exception as e:
                    failed_sentences.append((i, sentence, str(e)))
                    continue

            if failed_sentences:
                error_msg = "\n".join(
                    [f"Sentence {i+1}: '{s}' - {e}" for i, s, e in failed_sentences]
                )
                raise ValueError(
                    f"Failed to generate audio for some sentences:\n{error_msg}"
                )

            if not audio_segments:
                return (None, []) if not return_chunks else ([], [])

            if return_chunks:
                return audio_segments, phonemes_list
            return np.concatenate(audio_segments), phonemes_list

        except Exception as e:
            raise ValueError(f"Error in audio generation: {str(e)}")