import os
import tempfile

import gradio as gr
import numpy as np
import torch
import torchaudio
from speechbrain.inference.VAD import VAD

# Initialize the VAD model once at import time so every request reuses it.
vad = VAD.from_hparams(source="speechbrain/vad-crdnn-libriparty")

# The model's expected sample rate
MODEL_SAMPLE_RATE = 16000


def _to_mono_float_tensor(waveform_data):
    """Convert a Gradio numpy waveform into a 1-D float tensor.

    int16 and int32 input is scaled by its full-scale value, float32 input
    is passed through unchanged, and any other dtype is peak-normalized.
    """
    # Gradio delivers (samples,) for mono and (samples, channels) for
    # multi-channel audio, so the channel axis is the LAST one. Keep only
    # the first channel before any further processing.
    if waveform_data.ndim > 1:
        waveform_data = waveform_data[:, 0]

    waveform_tensor = torch.from_numpy(
        np.ascontiguousarray(waveform_data)
    ).float()

    if waveform_data.dtype == np.int16:
        waveform_tensor = waveform_tensor / 32768.0
    elif waveform_data.dtype == np.int32:
        waveform_tensor = waveform_tensor / 2147483648.0
    elif waveform_data.dtype != np.float32:
        peak = waveform_tensor.abs().max()
        if peak > 0:
            waveform_tensor = waveform_tensor / peak
    return waveform_tensor


def _run_vad(waveform_tensor):
    """Run the VAD on a 1-D waveform sampled at MODEL_SAMPLE_RATE.

    Returns whatever ``vad.get_speech_segments`` returns for the audio.
    """
    # get_speech_segments() expects the path of an audio file rather than a
    # tensor, so the prepared waveform is written to a temporary WAV first.
    # delete=False + close() lets other code reopen the path on every OS.
    handle = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    handle.close()
    try:
        torchaudio.save(
            handle.name, waveform_tensor.unsqueeze(0), MODEL_SAMPLE_RATE
        )
        return vad.get_speech_segments(handle.name)
    finally:
        os.unlink(handle.name)


def perform_vad(audio_input):
    """Detect speech segments in audio supplied by a Gradio Audio component.

    Args:
        audio_input: ``None`` or a ``(sample_rate, numpy_array)`` tuple as
            produced by ``gr.Audio(type="numpy")``.

    Returns:
        A ``(text, json)`` pair. ``text`` is a human-readable report or a
        status/error message; ``json`` is a list of
        ``{"startTime": ..., "endTime": ...}`` dicts in seconds, or ``None``
        when there is nothing to report.
    """
    if audio_input is None:
        return "No audio file provided.", None

    original_sample_rate, waveform_data = audio_input

    # Broad catch is deliberate: this is the UI boundary, and the user should
    # see the failure as a message instead of a crashed request.
    try:
        # Zero-length audio cannot contain speech; skip the model entirely.
        if waveform_data.size == 0:
            return "No speech detected in the audio.", None

        waveform_tensor = _to_mono_float_tensor(waveform_data)

        # Resample only when the input rate differs from the model's rate.
        if original_sample_rate != MODEL_SAMPLE_RATE:
            resampler = torchaudio.transforms.Resample(
                orig_freq=original_sample_rate,
                new_freq=MODEL_SAMPLE_RATE,
            )
            waveform_tensor = resampler(waveform_tensor)

        speech_segments = _run_vad(waveform_tensor)

        if speech_segments.numel() == 0:
            return "No speech detected in the audio.", None

        # The boundaries are already expressed in seconds (start, end per
        # row), so they are only rounded - not divided by the sample rate.
        output_json = [
            {"startTime": round(start, 3), "endTime": round(end, 3)}
            for start, end in speech_segments.tolist()
        ]
        output_text = (
            "Detected Speech Segments (startTime, endTime in seconds):\n"
            + "".join(
                f"- [{seg['startTime']}, {seg['endTime']}]\n"
                for seg in output_json
            )
        )
        return output_text, output_json

    except Exception as e:
        return f"An error occurred: {type(e).__name__} - {str(e)}", None


# --- Gradio Interface ---
with gr.Blocks() as demo:
    gr.Markdown("# SpeechBrain VAD Demo")
    gr.Markdown("Upload an audio file to detect speech segments...")

    audio_input = gr.Audio(type="numpy", label="Upload Your Audio")
    process_button = gr.Button("Detect Speech")

    with gr.Row():
        text_output = gr.Textbox(label="Detected Timestamps")
        json_output = gr.JSON(label="JSON Output for Backend")

    process_button.click(
        fn=perform_vad,
        inputs=audio_input,
        outputs=[text_output, json_output],
    )

demo.launch()