mohAbdullah committed on
Commit
9c67c5f
·
verified ·
1 Parent(s): 28f0015

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +3 -8
app.py CHANGED
@@ -1,7 +1,8 @@
1
  import gradio as gr
2
  from speechbrain.inference.VAD import VAD
3
  import torch
4
- import numpy as np # Import numpy for data type info
 
5
 
6
  # Initialize the VAD model
7
  vad = VAD.from_hparams(source="speechbrain/vad-crdnn-libriparty")
@@ -19,20 +20,17 @@ def perform_vad(audio_input):
19
  original_sample_rate, waveform_data = audio_input
20
 
21
  try:
22
- # --- START OF FINAL FIX ---
23
  # 1. Convert the numpy array to a torch tensor
24
  waveform_tensor = torch.from_numpy(waveform_data).float()
25
 
26
  # 2. Normalize the audio to the [-1.0, 1.0] range
27
- # Check the data type of the numpy array to find the correct normalization factor
28
  if waveform_data.dtype == np.int16:
29
  waveform_tensor = waveform_tensor / 32768.0
30
  elif waveform_data.dtype == np.int32:
31
  waveform_tensor = waveform_tensor / 2147483648.0
32
  elif waveform_data.dtype == np.float32:
33
- pass # Already in the correct range
34
  else:
35
- # Fallback for other types
36
  max_val = torch.max(torch.abs(waveform_tensor))
37
  if max_val > 0:
38
  waveform_tensor = waveform_tensor / max_val
@@ -50,7 +48,6 @@ def perform_vad(audio_input):
50
  elif waveform_tensor.ndim > 1:
51
  # If stereo, take the first channel
52
  waveform_tensor = waveform_tensor[0, :].unsqueeze(0)
53
- # --- END OF FINAL FIX ---
54
 
55
  # Pass the perfectly formatted tensor to the VAD model
56
  speech_segments = vad.get_speech_segments(waveform_tensor)
@@ -61,7 +58,6 @@ def perform_vad(audio_input):
61
  output_text = "Detected Speech Segments (startTime, endTime in seconds):\n"
62
  output_json = []
63
 
64
- # The VAD returns timestamps based on the model's sample rate (16000)
65
  for segment in speech_segments:
66
  start_sample = segment[0].item()
67
  end_sample = segment[1].item()
@@ -75,7 +71,6 @@ def perform_vad(audio_input):
75
  return output_text, output_json
76
 
77
  except Exception as e:
78
- # Provide detailed error for debugging
79
  return f"An error occurred: {type(e).__name__} - {str(e)}", None
80
 
81
  # --- Gradio Interface ---
 
1
  import gradio as gr
2
  from speechbrain.inference.VAD import VAD
3
  import torch
4
+ import torchaudio # <--- THIS IS THE FIX
5
+ import numpy as np
6
 
7
  # Initialize the VAD model
8
  vad = VAD.from_hparams(source="speechbrain/vad-crdnn-libriparty")
 
20
  original_sample_rate, waveform_data = audio_input
21
 
22
  try:
 
23
  # 1. Convert the numpy array to a torch tensor
24
  waveform_tensor = torch.from_numpy(waveform_data).float()
25
 
26
  # 2. Normalize the audio to the [-1.0, 1.0] range
 
27
  if waveform_data.dtype == np.int16:
28
  waveform_tensor = waveform_tensor / 32768.0
29
  elif waveform_data.dtype == np.int32:
30
  waveform_tensor = waveform_tensor / 2147483648.0
31
  elif waveform_data.dtype == np.float32:
32
+ pass
33
  else:
 
34
  max_val = torch.max(torch.abs(waveform_tensor))
35
  if max_val > 0:
36
  waveform_tensor = waveform_tensor / max_val
 
48
  elif waveform_tensor.ndim > 1:
49
  # If stereo, take the first channel
50
  waveform_tensor = waveform_tensor[0, :].unsqueeze(0)
 
51
 
52
  # Pass the perfectly formatted tensor to the VAD model
53
  speech_segments = vad.get_speech_segments(waveform_tensor)
 
58
  output_text = "Detected Speech Segments (startTime, endTime in seconds):\n"
59
  output_json = []
60
 
 
61
  for segment in speech_segments:
62
  start_sample = segment[0].item()
63
  end_sample = segment[1].item()
 
71
  return output_text, output_json
72
 
73
  except Exception as e:
 
74
  return f"An error occurred: {type(e).__name__} - {str(e)}", None
75
 
76
  # --- Gradio Interface ---