# RAG/app.py (Corrected TTS Speaker)
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
from peft import PeftModel
from TTS.api import TTS
import tempfile

# --- 1. Configuration ---
BASE_MODEL_ID = "microsoft/phi-2"
PEFT_MODEL_ID = "MrunangG/phi-2-mbux-assistant"
STT_MODEL_ID = "openai/whisper-base.en"
TTS_MODEL_ID = "tts_models/en/vctk/vits"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# --- 2. Model Loading ---
print(f"--- Loading all models on device: {DEVICE} ---")

model_kwargs = {"trust_remote_code": True}
if DEVICE == "cuda":
    # 4-bit NF4 quantization keeps the phi-2 weights small enough for consumer GPUs.
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
    )
    model_kwargs["quantization_config"] = quantization_config
    model_kwargs["torch_dtype"] = torch.float16
else:
    model_kwargs["torch_dtype"] = torch.float32

base_model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_ID, **model_kwargs)
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model = PeftModel.from_pretrained(base_model, PEFT_MODEL_ID)
print("✅ Language Model loaded successfully!")

stt_pipe = pipeline("automatic-speech-recognition", model=STT_MODEL_ID, device=DEVICE)
print("✅ Speech-to-Text Model loaded successfully!")

tts = TTS(TTS_MODEL_ID, gpu=(DEVICE == "cuda"))
print("✅ Text-to-Speech Model loaded successfully!")
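# The VCTK model is multi-speaker, and the exact speaker list depends on the
# installed Coqui TTS version. To confirm that the voice used below ("p360")
# is available, you can print the model's speaker IDs:
#
#   print(tts.speakers)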
# --- 3. Full Voice-to-Voice Pipeline ---
def voice_assistant(audio_filepath):
    """
    Takes a filepath to an audio recording, transcribes it, gets a response
    from the LLM, and converts that response back to speech.
    """
    if audio_filepath is None:
        return None

    # 1. Transcribe Audio to Text (STT)
    transcription = stt_pipe(audio_filepath)["text"]
    print(f"Transcription: '{transcription}'")

    # 2. Generate Text Response (LLM)
    formatted_prompt = f"[INST] {transcription} [/INST]"
    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=50, repetition_penalty=1.15)
    # Keep only the text generated after the prompt delimiter.
    response_text = tokenizer.decode(outputs[0], skip_special_tokens=True).split("[/INST]")[-1].strip()
    print(f"Generated Response: '{response_text}'")

    # 3. Synthesize Speech from Text (TTS)
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
        # *** THIS IS THE FIX ***
        # The multi-speaker VCTK model requires an explicit speaker ID;
        # "p360" is just one of many available voices.
        tts.tts_to_file(text=response_text, file_path=fp.name, speaker="p360")
        return fp.name
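# A quick end-to-end smoke test of the pipeline, useful before wiring up the
# UI below. This is only a sketch: it assumes a short local recording named
# sample.wav exists next to this script.
#
#   wav_path = voice_assistant("sample.wav")
#   print(f"Synthesized reply written to: {wav_path}")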
# --- 4. Gradio UI Definition ---
# `article` is rendered as Markdown below the demo; the links point at the
# Hugging Face pages for the model IDs configured above.
cpu_warning = (
    ""
    if DEVICE == "cuda"
    else "⚠️ **Warning: No GPU detected. Performance will be very slow.**"
)
article = (
    f"{cpu_warning}\n\n"
    "Fine-tuned model based on [microsoft/phi-2](https://huggingface.co/microsoft/phi-2). "
    "[Model Repo](https://huggingface.co/MrunangG/phi-2-mbux-assistant)"
)
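# Once the app defined below is running, it can also be driven
# programmatically. A minimal sketch using the gradio_client package; how
# audio inputs are passed varies between Gradio versions, so treat this as
# illustrative rather than definitive:
#
#   from gradio_client import Client, handle_file
#   client = Client("http://localhost:7860")
#   reply_path = client.predict(handle_file("command.wav"))
#   print(reply_path)  # filepath of the synthesized speech response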
iface = gr.Interface(
    fn=voice_assistant,
    inputs=gr.Audio(sources=["microphone"], type="filepath", label="Speak your command..."),
    outputs=gr.Audio(label="Response", autoplay=True),
    title="🚗 Voice-to-Voice MBUX Assistant",
    description="Speak a command into your microphone, and the AI assistant will respond with voice.",
    article=article,
    theme=gr.themes.Soft(),
)

# --- 5. App Launch ---
if __name__ == "__main__":
    iface.launch(server_name="0.0.0.0", server_port=7860)