# RAG/app.py: voice-to-voice assistant (STT -> LLM -> TTS)
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
from peft import PeftModel
from TTS.api import TTS
import tempfile
import os
# --- 1. Configuration ---
BASE_MODEL_ID = "microsoft/phi-2"
PEFT_MODEL_ID = "MrunangG/phi-2-mbux-assistant"
STT_MODEL_ID = "openai/whisper-base.en"
TTS_MODEL_ID = "tts_models/en/vctk/vits"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# --- 2. Model Loading ---
print(f"--- Loading all models on device: {DEVICE} ---")
# Quantize to 4-bit NF4 with fp16 compute on GPU so phi-2 fits in modest
# VRAM; fall back to full-precision fp32 on CPU.
model_kwargs = {"trust_remote_code": True}
if DEVICE == "cuda":
quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.float16)
model_kwargs["quantization_config"] = quantization_config
model_kwargs["torch_dtype"] = torch.float16
else:
model_kwargs["torch_dtype"] = torch.float32
base_model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_ID, **model_kwargs)
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, trust_remote_code=True)
if tokenizer.pad_token is None:
    # phi-2's tokenizer ships without a pad token; reuse EOS so padding works.
    tokenizer.pad_token = tokenizer.eos_token
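# PeftModel.from_pretrained attaches the LoRA adapter weights on top of the
# (optionally 4-bit-quantized) base model; no merge is required for inference.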
model = PeftModel.from_pretrained(base_model, PEFT_MODEL_ID)
print("βœ… Language Model loaded successfully!")
stt_pipe = pipeline("automatic-speech-recognition", model=STT_MODEL_ID, device=DEVICE)
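# Note: whisper-base.en is an English-only checkpoint; the multilingual
# "openai/whisper-base" variant would be needed for other languages.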
print("βœ… Speech-to-Text Model loaded successfully!")
gpu_enabled = DEVICE == "cuda"
tts = TTS(TTS_MODEL_ID, gpu=gpu_enabled)
print("βœ… Text-to-Speech Model loaded successfully!")
# --- 3. Full Voice-to-Voice Pipeline ---
def voice_assistant(audio_filepath):
"""
Takes a filepath to an audio recording, transcribes it, gets a response from the LLM,
and converts that response back to speech.
"""
if audio_filepath is None:
return None
# 1. Transcribe Audio to Text (STT)
transcription = stt_pipe(audio_filepath)["text"]
print(f"Transcription: '{transcription}'")
# 2. Generate Text Response (LLM)
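    # The [INST] ... [/INST] wrapper is assumed to match the template the
    # adapter was fine-tuned on; a mismatched prompt format degrades output.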
formatted_prompt = f"[INST] {transcription} [/INST]"
inputs = tokenizer(formatted_prompt, return_tensors="pt").to(DEVICE)
with torch.no_grad():
outputs = model.generate(**inputs, max_new_tokens=50, repetition_penalty=1.15)
response_text = tokenizer.decode(outputs[0], skip_special_tokens=True).split('[/INST]')[-1].strip()
print(f"Generated Response: '{response_text}'")
# 3. Synthesize Speech from Text (TTS)
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
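        # delete=False keeps the .wav on disk after the `with` block exits so
        # Gradio can read the returned path; the OS temp dir handles cleanup.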
        # VCTK/VITS is a multi-speaker model, so a speaker must be named
        # explicitly; "p360" is just one of the many available voices.
        tts.tts_to_file(text=response_text, file_path=fp.name, speaker="p360")
return fp.name
# --- 4. Gradio UI Definition ---
# Show a prominent performance warning in the UI when no GPU is available.
cpu_warning = "" if DEVICE == "cuda" else "<div style='background-color: #FFD2D2; color: #D8000C; padding: 10px; border-radius: 5px;'><strong>Warning:</strong> No GPU detected. Performance will be very slow.</div>"
article = f"{cpu_warning}<div style='text-align: center; margin-top: 20px;'><p>Fine-tuned model based on microsoft/phi-2. <a href='https://huggingface.co/{PEFT_MODEL_ID}' target='_blank'>Model Repo</a></p></div>"
iface = gr.Interface(
fn=voice_assistant,
inputs=gr.Audio(sources=["microphone"], type="filepath", label="Speak your command..."),
outputs=gr.Audio(label="Response", autoplay=True),
title="πŸš— Voice-to-Voice MBUX Assistant",
description="Speak a command into your microphone, and the AI assistant will respond with voice.",
article=article,
theme=gr.themes.Soft()
)
# --- 5. App Launch ---
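# Binding to 0.0.0.0:7860 makes the server reachable from outside the
# container, which is what Hugging Face Spaces expects.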
if __name__ == "__main__":
iface.launch(server_name="0.0.0.0", server_port=7860)