Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,7 +1,8 @@
|
|
1 |
import gradio as gr
|
2 |
from speechbrain.inference.VAD import VAD
|
3 |
import torch
|
4 |
-
import
|
|
|
5 |
|
6 |
# Initialize the VAD model
|
7 |
vad = VAD.from_hparams(source="speechbrain/vad-crdnn-libriparty")
|
@@ -19,20 +20,17 @@ def perform_vad(audio_input):
|
|
19 |
original_sample_rate, waveform_data = audio_input
|
20 |
|
21 |
try:
|
22 |
-
# --- START OF FINAL FIX ---
|
23 |
# 1. Convert the numpy array to a torch tensor
|
24 |
waveform_tensor = torch.from_numpy(waveform_data).float()
|
25 |
|
26 |
# 2. Normalize the audio to the [-1.0, 1.0] range
|
27 |
-
# Check the data type of the numpy array to find the correct normalization factor
|
28 |
if waveform_data.dtype == np.int16:
|
29 |
waveform_tensor = waveform_tensor / 32768.0
|
30 |
elif waveform_data.dtype == np.int32:
|
31 |
waveform_tensor = waveform_tensor / 2147483648.0
|
32 |
elif waveform_data.dtype == np.float32:
|
33 |
-
pass
|
34 |
else:
|
35 |
-
# Fallback for other types
|
36 |
max_val = torch.max(torch.abs(waveform_tensor))
|
37 |
if max_val > 0:
|
38 |
waveform_tensor = waveform_tensor / max_val
|
@@ -50,7 +48,6 @@ def perform_vad(audio_input):
|
|
50 |
elif waveform_tensor.ndim > 1:
|
51 |
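# If the tensor has more than one dimension, keep only index 0 of the first axis (this is the first channel only if the layout is channels-first - TODO confirm against the input's axis order)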
|
52 |
waveform_tensor = waveform_tensor[0, :].unsqueeze(0)
|
53 |
-
# --- END OF FINAL FIX ---
|
54 |
|
55 |
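# Pass the prepared waveform tensor to the VAD model (NOTE(review): SpeechBrain documents get_speech_segments as taking an audio file path - confirm a tensor is accepted)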
|
56 |
speech_segments = vad.get_speech_segments(waveform_tensor)
|
@@ -61,7 +58,6 @@ def perform_vad(audio_input):
|
|
61 |
output_text = "Detected Speech Segments (startTime, endTime in seconds):\n"
|
62 |
output_json = []
|
63 |
|
64 |
-
# The VAD returns timestamps based on the model's sample rate (16000)
|
65 |
for segment in speech_segments:
|
66 |
start_sample = segment[0].item()
|
67 |
end_sample = segment[1].item()
|
@@ -75,7 +71,6 @@ def perform_vad(audio_input):
|
|
75 |
return output_text, output_json
|
76 |
|
77 |
except Exception as e:
|
78 |
-
# Provide detailed error for debugging
|
79 |
return f"An error occurred: {type(e).__name__} - {str(e)}", None
|
80 |
|
81 |
# --- Gradio Interface ---
|
|
|
1 |
import gradio as gr
|
2 |
from speechbrain.inference.VAD import VAD
|
3 |
import torch
|
4 |
+
import torchaudio # <--- THIS IS THE FIX
|
5 |
+
import numpy as np
|
6 |
|
7 |
# Initialize the VAD model
|
8 |
vad = VAD.from_hparams(source="speechbrain/vad-crdnn-libriparty")
|
|
|
20 |
original_sample_rate, waveform_data = audio_input
|
21 |
|
22 |
try:
|
|
|
23 |
# 1. Convert the numpy array to a torch tensor
|
24 |
waveform_tensor = torch.from_numpy(waveform_data).float()
|
25 |
|
26 |
# 2. Normalize the audio to the [-1.0, 1.0] range
|
|
|
27 |
if waveform_data.dtype == np.int16:
|
28 |
waveform_tensor = waveform_tensor / 32768.0
|
29 |
elif waveform_data.dtype == np.int32:
|
30 |
waveform_tensor = waveform_tensor / 2147483648.0
|
31 |
elif waveform_data.dtype == np.float32:
|
32 |
+
pass
|
33 |
else:
|
|
|
34 |
max_val = torch.max(torch.abs(waveform_tensor))
|
35 |
if max_val > 0:
|
36 |
waveform_tensor = waveform_tensor / max_val
|
|
|
48 |
elif waveform_tensor.ndim > 1:
|
49 |
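# If the tensor has more than one dimension, keep only index 0 of the first axis (this is the first channel only if the layout is channels-first - TODO confirm against the input's axis order)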
|
50 |
waveform_tensor = waveform_tensor[0, :].unsqueeze(0)
|
|
|
51 |
|
52 |
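# Pass the prepared waveform tensor to the VAD model (NOTE(review): SpeechBrain documents get_speech_segments as taking an audio file path - confirm a tensor is accepted)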
|
53 |
speech_segments = vad.get_speech_segments(waveform_tensor)
|
|
|
58 |
output_text = "Detected Speech Segments (startTime, endTime in seconds):\n"
|
59 |
output_json = []
|
60 |
|
|
|
61 |
for segment in speech_segments:
|
62 |
start_sample = segment[0].item()
|
63 |
end_sample = segment[1].item()
|
|
|
71 |
return output_text, output_json
|
72 |
|
73 |
except Exception as e:
|
|
|
74 |
return f"An error occurred: {type(e).__name__} - {str(e)}", None
|
75 |
|
76 |
# --- Gradio Interface ---
|