# RAG/app.py (Corrected TTS Speaker)
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
from peft import PeftModel
from TTS.api import TTS
import tempfile

# --- 1. Configuration ---
BASE_MODEL_ID = "microsoft/phi-2"
PEFT_MODEL_ID = "MrunangG/phi-2-mbux-assistant"
STT_MODEL_ID = "openai/whisper-base.en"
TTS_MODEL_ID = "tts_models/en/vctk/vits"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# --- 2. Model Loading ---
print(f"--- Loading all models on device: {DEVICE} ---")

model_kwargs = {"trust_remote_code": True}
if DEVICE == "cuda":
    # 4-bit NF4 quantization keeps the phi-2 weights small enough for consumer GPUs.
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
    )
    model_kwargs["quantization_config"] = quantization_config
    model_kwargs["torch_dtype"] = torch.float16
else:
    model_kwargs["torch_dtype"] = torch.float32

base_model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_ID, **model_kwargs)
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model = PeftModel.from_pretrained(base_model, PEFT_MODEL_ID)
print("✅ Language Model loaded successfully!")

stt_pipe = pipeline("automatic-speech-recognition", model=STT_MODEL_ID, device=DEVICE)
print("✅ Speech-to-Text Model loaded successfully!")

tts = TTS(TTS_MODEL_ID, gpu=(DEVICE == "cuda"))
print("✅ Text-to-Speech Model loaded successfully!")
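# The VCTK model is multi-speaker, and the exact speaker list depends on the
# installed Coqui TTS version. To confirm that the voice used below ("p360")
# is available, you can print the model's speaker IDs:
#
#   print(tts.speakers)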
# --- 3. Full Voice-to-Voice Pipeline ---
def voice_assistant(audio_filepath):
    """
    Takes a filepath to an audio recording, transcribes it, gets a response
    from the LLM, and converts that response back to speech.
    """
    if audio_filepath is None:
        return None

    # 1. Transcribe Audio to Text (STT)
    transcription = stt_pipe(audio_filepath)["text"]
    print(f"Transcription: '{transcription}'")

    # 2. Generate Text Response (LLM)
    formatted_prompt = f"[INST] {transcription} [/INST]"
    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=50, repetition_penalty=1.15)
    # Keep only the text generated after the prompt delimiter.
    response_text = tokenizer.decode(outputs[0], skip_special_tokens=True).split("[/INST]")[-1].strip()
    print(f"Generated Response: '{response_text}'")

    # 3. Synthesize Speech from Text (TTS)
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
        # *** THIS IS THE FIX ***
        # The multi-speaker VCTK model requires an explicit speaker ID;
        # "p360" is just one of many available voices.
        tts.tts_to_file(text=response_text, file_path=fp.name, speaker="p360")
        return fp.name
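# A quick end-to-end smoke test of the pipeline, useful before wiring up the
# UI below. This is only a sketch: it assumes a short local recording named
# sample.wav exists next to this script.
#
#   wav_path = voice_assistant("sample.wav")
#   print(f"Synthesized reply written to: {wav_path}")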
# --- 4. Gradio UI Definition ---
# `article` is rendered as Markdown below the demo; the links point at the
# Hugging Face pages for the model IDs configured above.
cpu_warning = (
    ""
    if DEVICE == "cuda"
    else "⚠️ **Warning: No GPU detected. Performance will be very slow.**"
)
article = (
    f"{cpu_warning}\n\n"
    "Fine-tuned model based on [microsoft/phi-2](https://huggingface.co/microsoft/phi-2). "
    "[Model Repo](https://huggingface.co/MrunangG/phi-2-mbux-assistant)"
)
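# Once the app defined below is running, it can also be driven
# programmatically. A minimal sketch using the gradio_client package; how
# audio inputs are passed varies between Gradio versions, so treat this as
# illustrative rather than definitive:
#
#   from gradio_client import Client, handle_file
#   client = Client("http://localhost:7860")
#   reply_path = client.predict(handle_file("command.wav"))
#   print(reply_path)  # filepath of the synthesized speech response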
iface = gr.Interface(
    fn=voice_assistant,
    inputs=gr.Audio(sources=["microphone"], type="filepath", label="Speak your command..."),
    outputs=gr.Audio(label="Response", autoplay=True),
    title="🚗 Voice-to-Voice MBUX Assistant",
    description="Speak a command into your microphone, and the AI assistant will respond with voice.",
    article=article,
    theme=gr.themes.Soft(),
)

# --- 5. App Launch ---
if __name__ == "__main__":
    iface.launch(server_name="0.0.0.0", server_port=7860)