import os
import tempfile

import gradio as gr
import numpy as np
import torch
import torchaudio # <--- THIS IS THE FIX
from speechbrain.inference.VAD import VAD

# Sample rate (in Hz) that the audio is converted to before it reaches the model.
MODEL_SAMPLE_RATE = 16000

# Load the pretrained voice-activity detector once, at import time, so every
# request served by the UI reuses the same instance.
vad = VAD.from_hparams(source="speechbrain/vad-crdnn-libriparty")

def _prepare_waveform(sample_rate, samples):
    """Turn raw Gradio samples into a mono ``[1, num_samples]`` float tensor.

    Args:
        sample_rate: Sample rate (Hz) of ``samples`` as reported by Gradio.
        samples: Numpy array of audio samples, either ``(num_samples,)`` or
            ``(num_samples, num_channels)``.

    Returns:
        A 2D float tensor of shape ``[1, num_samples]`` at ``MODEL_SAMPLE_RATE``.
    """
    waveform = torch.from_numpy(samples).float()

    # Scale integer PCM to [-1.0, 1.0]; float32 is taken as already scaled and
    # any other dtype is peak-normalized.
    if samples.dtype == np.int16:
        waveform = waveform / 32768.0
    elif samples.dtype == np.int32:
        waveform = waveform / 2147483648.0
    elif samples.dtype != np.float32:
        peak = torch.max(torch.abs(waveform))
        if peak > 0:
            waveform = waveform / peak

    # Gradio lays multi-channel audio out as (num_samples, num_channels).
    # Keep the first channel *before* resampling, because the resampler
    # operates along the last axis, which must therefore be time.
    if waveform.ndim > 1:
        waveform = waveform[:, 0]

    if sample_rate != MODEL_SAMPLE_RATE:
        resampler = torchaudio.transforms.Resample(
            orig_freq=sample_rate, new_freq=MODEL_SAMPLE_RATE
        )
        waveform = resampler(waveform)

    return waveform.unsqueeze(0)


def perform_vad(audio_input):
    """Run voice-activity detection on audio supplied by the Gradio UI.

    Args:
        audio_input: ``(sample_rate, samples)`` tuple as produced by
            ``gr.Audio(type="numpy")``, or ``None`` when nothing was uploaded.

    Returns:
        A ``(text, segments)`` tuple. ``text`` is a human-readable summary (or
        an error / "no speech" message) and ``segments`` is a list of
        ``{"startTime": float, "endTime": float}`` dicts in seconds, or
        ``None`` when there is nothing to report.
    """
    if audio_input is None:
        return "No audio file provided.", None

    original_sample_rate, waveform_data = audio_input

    try:
        waveform_tensor = _prepare_waveform(original_sample_rate, waveform_data)

        # SpeechBrain's get_speech_segments is documented to take the path of
        # an audio file (not a tensor), so the prepared mono signal is written
        # to a temporary WAV that lives only for the duration of the call.
        with tempfile.TemporaryDirectory() as tmp_dir:
            wav_path = os.path.join(tmp_dir, "vad_input.wav")
            torchaudio.save(wav_path, waveform_tensor, MODEL_SAMPLE_RATE)
            speech_segments = vad.get_speech_segments(wav_path)

        if speech_segments.shape[0] == 0:
            return "No speech detected in the audio.", None

        # The returned boundaries are already expressed in seconds (one
        # [start, end] row per segment), so they are only rounded here and
        # must not be divided by the sample rate.
        output_json = [
            {"startTime": round(start, 3), "endTime": round(end, 3)}
            for start, end in speech_segments.tolist()
        ]

        segment_lines = "".join(
            f"- [{item['startTime']}, {item['endTime']}]\n" for item in output_json
        )
        output_text = (
            "Detected Speech Segments (startTime, endTime in seconds):\n"
            + segment_lines
        )

        return output_text, output_json

    except Exception as e:
        # UI boundary: report the failure in the text box instead of letting
        # the exception propagate out of the Gradio callback.
        return f"An error occurred: {type(e).__name__} - {str(e)}", None

# --- Gradio UI: one audio input, one button, text + JSON results ---
with gr.Blocks() as demo:
    # Page heading followed by a short usage hint.
    gr.Markdown(value="# SpeechBrain VAD Demo")
    gr.Markdown(value="Upload an audio file to detect speech segments...")

    # The audio component is configured with type="numpy" for the callback.
    audio_input = gr.Audio(label="Upload Your Audio", type="numpy")
    process_button = gr.Button(value="Detect Speech")

    # Results are shown side by side: readable text and machine-readable JSON.
    with gr.Row():
        text_output = gr.Textbox(label="Detected Timestamps")
        json_output = gr.JSON(label="JSON Output for Backend")

    # Clicking the button runs perform_vad on the uploaded audio and routes
    # its two return values to the two output components.
    process_button.click(perform_vad, audio_input, [text_output, json_output])

demo.launch()