# RAG/app.py (Corrected TTS Speaker)
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
from peft import PeftModel
from TTS.api import TTS
import tempfile
# --- 1. Configuration ---
BASE_MODEL_ID = "microsoft/phi-2"
PEFT_MODEL_ID = "MrunangG/phi-2-mbux-assistant"
STT_MODEL_ID = "openai/whisper-base.en"
TTS_MODEL_ID = "tts_models/en/vctk/vits"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# --- 2. Model Loading ---
print(f"--- Loading all models on device: {DEVICE} ---")
# ... (Model loading code is the same) ...
model_kwargs = {"trust_remote_code": True}
if DEVICE == "cuda":
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
    )
    model_kwargs["quantization_config"] = quantization_config
    model_kwargs["device_map"] = "auto"  # let accelerate place the quantized weights on the GPU
    model_kwargs["torch_dtype"] = torch.float16
else:
    model_kwargs["torch_dtype"] = torch.float32
base_model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_ID, **model_kwargs)
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
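# phi-2's tokenizer ships without a pad token, so the EOS token is reused for
# padding whenever inputs are batched or attention masks are built.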
model = PeftModel.from_pretrained(base_model, PEFT_MODEL_ID)
model.eval()  # disable dropout in the LoRA layers for inference
print("✅ Language Model loaded successfully!")
stt_pipe = pipeline("automatic-speech-recognition", model=STT_MODEL_ID, device=DEVICE)
print("✅ Speech-to-Text Model loaded successfully!")
gpu_enabled = DEVICE == "cuda"
tts = TTS(TTS_MODEL_ID, gpu=gpu_enabled)
print("✅ Text-to-Speech Model loaded successfully!")
# --- 3. Full Voice-to-Voice Pipeline ---
def voice_assistant(audio_filepath):
    """
    Take the filepath of an audio recording, transcribe it, generate a response
    from the fine-tuned LLM, and synthesize that response back to speech.
    Returns the path of the generated WAV file, or None if no audio was given.
    """
    if audio_filepath is None:
        return None

    # 1. Transcribe Audio to Text (STT)
    transcription = stt_pipe(audio_filepath)["text"]
    print(f"Transcription: '{transcription}'")

    # 2. Generate Text Response (LLM)
    formatted_prompt = f"[INST] {transcription} [/INST]"
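    # The [INST] ... [/INST] wrapper presumably mirrors the chat template the
    # adapter was fine-tuned on; the delimiters must match at inference time.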
    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=50, repetition_penalty=1.15)
    response_text = tokenizer.decode(outputs[0], skip_special_tokens=True).split('[/INST]')[-1].strip()
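    # generate() returns the prompt followed by the new tokens, so the decoded
    # string still contains the input; keeping only the text after the final
    # [/INST] tag isolates the assistant's reply.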
print(f"Generated Response: '{response_text}'") | |
# 3. Synthesize Speech from Text (TTS) | |
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp: | |
# *** THIS IS THE FIX *** | |
# We must specify which speaker's voice to use from the VCTK model. | |
# 'p360' is just one of many available voices. | |
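        # To browse the full list of voices, you can inspect the model's
        # `speakers` attribute (IDs like "p225", "p226", ...), e.g.:
        #   print(tts.speakers)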
        tts.tts_to_file(text=response_text, file_path=fp.name, speaker="p360")

    # delete=False keeps the temp file on disk after the context manager
    # closes it, so Gradio can stream the audio from this path.
    return fp.name
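
# Quick local smoke test -- "test_command.wav" is a hypothetical recording;
# substitute any short WAV of a spoken command:
#   response_path = voice_assistant("test_command.wav")
#   print(f"Response audio written to {response_path}")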
# --- 4. Gradio UI Definition ---
# ... (The UI code is correct and does not need to change) ...
cpu_warning = "" if DEVICE == "cuda" else "<div style='background-color: #FFD2D2; color: #D8000C; padding: 10px; border-radius: 5px;'><strong>Warning:</strong> No GPU detected. Performance will be very slow.</div>"
article = f"{cpu_warning}<div style='text-align: center; margin-top: 20px;'><p>Fine-tuned model based on microsoft/phi-2. <a href='https://huggingface.co/{PEFT_MODEL_ID}' target='_blank'>Model Repo</a></p></div>"
iface = gr.Interface(
    fn=voice_assistant,
    inputs=gr.Audio(sources=["microphone"], type="filepath", label="Speak your command..."),
    outputs=gr.Audio(label="Response", autoplay=True),
    title="🎙️ Voice-to-Voice MBUX Assistant",
    description="Speak a command into your microphone, and the AI assistant will respond with voice.",
    article=article,
    theme=gr.themes.Soft(),
)
# --- 5. App Launch ---
if __name__ == "__main__":
    iface.launch(server_name="0.0.0.0", server_port=7860)