# RAG/app.py: voice-to-voice assistant (STT -> LLM -> TTS)
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
from peft import PeftModel
from TTS.api import TTS
import tempfile
import os
# --- 1. Configuration ---
BASE_MODEL_ID = "microsoft/phi-2"
PEFT_MODEL_ID = "MrunangG/phi-2-mbux-assistant"
STT_MODEL_ID = "openai/whisper-base.en"
TTS_MODEL_ID = "tts_models/en/vctk/vits"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# --- 2. Model Loading ---
print(f"--- Loading all models on device: {DEVICE} ---")
# Quantize to 4-bit NF4 with fp16 compute on GPU so phi-2 fits in modest
# VRAM; fall back to full-precision fp32 on CPU.
model_kwargs = {"trust_remote_code": True}
if DEVICE == "cuda":
quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.float16)
model_kwargs["quantization_config"] = quantization_config
model_kwargs["torch_dtype"] = torch.float16
else:
model_kwargs["torch_dtype"] = torch.float32
base_model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_ID, **model_kwargs)
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, trust_remote_code=True)
if tokenizer.pad_token is None:
    # phi-2's tokenizer ships without a pad token; reuse EOS so padding works.
    tokenizer.pad_token = tokenizer.eos_token
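# PeftModel.from_pretrained attaches the LoRA adapter weights on top of the
# (optionally 4-bit-quantized) base model; no merge is required for inference.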
model = PeftModel.from_pretrained(base_model, PEFT_MODEL_ID)
print("βœ… Language Model loaded successfully!")
stt_pipe = pipeline("automatic-speech-recognition", model=STT_MODEL_ID, device=DEVICE)
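# Note: whisper-base.en is an English-only checkpoint; the multilingual
# "openai/whisper-base" variant would be needed for other languages.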
print("βœ… Speech-to-Text Model loaded successfully!")
gpu_enabled = DEVICE == "cuda"
tts = TTS(TTS_MODEL_ID, gpu=gpu_enabled)
print("βœ… Text-to-Speech Model loaded successfully!")
# --- 3. Full Voice-to-Voice Pipeline ---
def voice_assistant(audio_filepath):
"""
Takes a filepath to an audio recording, transcribes it, gets a response from the LLM,
and converts that response back to speech.
"""
if audio_filepath is None:
return None
# 1. Transcribe Audio to Text (STT)
transcription = stt_pipe(audio_filepath)["text"]
print(f"Transcription: '{transcription}'")
# 2. Generate Text Response (LLM)
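    # The [INST] ... [/INST] wrapper is assumed to match the template the
    # adapter was fine-tuned on; a mismatched prompt format degrades output.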
formatted_prompt = f"[INST] {transcription} [/INST]"
inputs = tokenizer(formatted_prompt, return_tensors="pt").to(DEVICE)
with torch.no_grad():
outputs = model.generate(**inputs, max_new_tokens=50, repetition_penalty=1.15)
response_text = tokenizer.decode(outputs[0], skip_special_tokens=True).split('[/INST]')[-1].strip()
print(f"Generated Response: '{response_text}'")
# 3. Synthesize Speech from Text (TTS)
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
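        # delete=False keeps the .wav on disk after the `with` block exits so
        # Gradio can read the returned path; the OS temp dir handles cleanup.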
        # VCTK/VITS is a multi-speaker model, so a speaker must be named
        # explicitly; "p360" is just one of the many available voices.
        tts.tts_to_file(text=response_text, file_path=fp.name, speaker="p360")
return fp.name
# --- 4. Gradio UI Definition ---
# Show a prominent performance warning in the UI when no GPU is available.
cpu_warning = "" if DEVICE == "cuda" else "<div style='background-color: #FFD2D2; color: #D8000C; padding: 10px; border-radius: 5px;'><strong>Warning:</strong> No GPU detected. Performance will be very slow.</div>"
article = f"{cpu_warning}<div style='text-align: center; margin-top: 20px;'><p>Fine-tuned model based on microsoft/phi-2. <a href='https://huggingface.co/{PEFT_MODEL_ID}' target='_blank'>Model Repo</a></p></div>"
iface = gr.Interface(
fn=voice_assistant,
inputs=gr.Audio(sources=["microphone"], type="filepath", label="Speak your command..."),
outputs=gr.Audio(label="Response", autoplay=True),
title="πŸš— Voice-to-Voice MBUX Assistant",
description="Speak a command into your microphone, and the AI assistant will respond with voice.",
article=article,
theme=gr.themes.Soft()
)
# --- 5. App Launch ---
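# Binding to 0.0.0.0:7860 makes the server reachable from outside the
# container, which is what Hugging Face Spaces expects.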
if __name__ == "__main__":
iface.launch(server_name="0.0.0.0", server_port=7860)