Spaces:
Running
Running
""" | |
Enhanced Advanced TTS Client with Better Dependency Handling | |
Fixes the 'datasets' module issue and transformers warnings | |
""" | |
import os | |
import logging | |
import torch | |
from pathlib import Path | |
from typing import Optional, Dict, Any | |
logger = logging.getLogger(__name__)


class AdvancedTTSClient:
    """Advanced TTS client with robust handling of optional dependencies.

    The client probes for the ``transformers`` and ``datasets`` packages at
    construction time and only attempts to load the SpeechT5 checkpoint when
    both are importable. Speech generation currently writes a placeholder
    sine tone to a temporary WAV file; the loaded models are stored in
    ``self.models`` but are not yet used for synthesis.
    """

    def __init__(self):
        """Pick the torch device, reset state flags and probe dependencies."""
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.models_loaded = False
        self.transformers_available = False
        self.datasets_available = False
        self.models = {}
        logger.info("Advanced TTS Client initialized on device: %s", self.device)
        # Check for required dependencies
        self._check_dependencies()

    @staticmethod
    def _is_importable(module_name: str) -> bool:
        """Return True if ``module_name`` imports cleanly, False on ImportError."""
        import importlib

        try:
            importlib.import_module(module_name)
        except ImportError:
            return False
        return True

    def _check_dependencies(self):
        """Record whether ``transformers`` and ``datasets`` can be imported.

        Both flags are reassigned on every call, so a re-check after an
        environment change cannot leave a stale ``True`` behind.
        """
        self.transformers_available = self._is_importable("transformers")
        if self.transformers_available:
            logger.info("SUCCESS: Transformers library available")
        else:
            logger.warning("WARNING: Transformers library not available")

        self.datasets_available = self._is_importable("datasets")
        if self.datasets_available:
            logger.info("SUCCESS: Datasets library available")
        else:
            logger.warning("WARNING: Datasets library not available")

        logger.info("Transformers available: %s", self.transformers_available)
        logger.info("Datasets available: %s", self.datasets_available)

    async def load_models(self) -> bool:
        """Load the SpeechT5 processor and model if dependencies are available.

        Returns:
            True when both objects were loaded and stored in ``self.models``;
            False when a dependency is missing or loading raised an exception
            (the exception is logged, not propagated).
        """
        if not self.transformers_available:
            logger.warning(
                "ERROR: Transformers not available - cannot load advanced TTS models"
            )
            return False
        if not self.datasets_available:
            logger.warning(
                "ERROR: Datasets not available - cannot load advanced TTS models"
            )
            return False

        try:
            logger.info("[PROCESS] Loading advanced TTS models...")
            import asyncio

            # Import here to avoid import errors if not available
            from transformers import AutoProcessor, AutoModel

            # from_pretrained blocks on disk/network I/O; run it in the default
            # executor so the event loop stays responsive while loading.
            loop = asyncio.get_running_loop()
            logger.info("Loading SpeechT5 TTS model...")
            processor = await loop.run_in_executor(
                None, AutoProcessor.from_pretrained, "microsoft/speecht5_tts"
            )
            model = await loop.run_in_executor(
                None, AutoModel.from_pretrained, "microsoft/speecht5_tts"
            )
        except Exception as e:
            logger.error("ERROR: Failed to load advanced TTS models: %s", e)
            return False

        self.models = {
            'processor': processor,
            'model': model,
        }
        self.models_loaded = True
        logger.info("SUCCESS: Advanced TTS models loaded successfully")
        return True

    async def text_to_speech(self, text: str, voice_id: Optional[str] = None) -> str:
        """Generate a WAV file for ``text`` and return its path.

        Args:
            text: Text to speak. Must contain at least one non-whitespace
                character.
            voice_id: Accepted for interface compatibility; the placeholder
                implementation does not use it.

        Returns:
            Path of a temporary ``.wav`` file. The caller owns the file and is
            responsible for deleting it.

        Raises:
            ValueError: If ``text`` is empty or only whitespace.
            RuntimeError: If the models are not loaded and cannot be loaded.
            Exception: Any error raised while generating or writing the audio
                is logged and re-raised.
        """
        # Reject empty input up front: it would otherwise yield a zero-length WAV.
        if not text or not text.strip():
            raise ValueError("Text for speech synthesis must not be empty")

        if not self.models_loaded:
            logger.warning("WARNING: Advanced TTS models not loaded, attempting to load...")
            success = await self.load_models()
            if not success:
                raise RuntimeError("Advanced TTS models not available")

        try:
            logger.info("Generating speech: %s...", text[:50])
            # For now, create a simple placeholder audio file
            # In production, this would use the loaded models
            import tempfile
            import numpy as np
            import soundfile as sf

            # Generate a simple tone as placeholder
            sample_rate = 16000
            duration = len(text) * 0.1  # Rough estimate
            t = np.linspace(0, duration, int(sample_rate * duration), False)
            audio = np.sin(440 * 2 * np.pi * t) * 0.3  # Simple sine wave

            # Reserve a temp path and close the descriptor before writing, so
            # no handle leaks and the file can be reopened on every platform.
            fd, output_path = tempfile.mkstemp(suffix='.wav')
            os.close(fd)
            try:
                sf.write(output_path, audio, sample_rate)
            except Exception:
                # Best-effort cleanup: do not leave an orphaned temp file behind.
                try:
                    os.remove(output_path)
                except OSError:
                    pass
                raise

            logger.info("SUCCESS: Advanced TTS audio generated: %s", output_path)
            return output_path
        except Exception as e:
            logger.error("ERROR: Advanced TTS generation failed: %s", e)
            raise

    async def get_available_voices(self) -> Dict[str, str]:
        """Return the static mapping of voice id to display label."""
        return {
            "21m00Tcm4TlvDq8ikWAM": "Female (Neural)",
            "pNInz6obpgDQGcFmaJgB": "Male (Neural)",
            "EXAVITQu4vr4xnSDxMaL": "Female (Expressive)",
            "ErXwobaYiN019PkySvjV": "Male (Professional)",
            "TxGEqnHWrfGW9XjX": "Male (Deep Neural)",
            "yoZ06aMxZJJ28mfd3POQ": "Unisex (Friendly)",
            "AZnzlk1XvdvUeBnXmlld": "Female (Strong)",
        }

    def get_model_info(self) -> Dict[str, Any]:
        """Return a snapshot of dependency flags, device and load status."""
        return {
            "models_loaded": self.models_loaded,
            "transformers_available": self.transformers_available,
            "datasets_available": self.datasets_available,
            "device": self.device,
            "vits_available": self.transformers_available,
            "speecht5_available": self.transformers_available and self.datasets_available,
            "status": "Advanced TTS Ready" if self.models_loaded else "Fallback Mode",
        }
# Public names re-exported by `from <module> import *` (kept for backwards compatibility)
__all__ = ["AdvancedTTSClient"]