SebastianBodza committed on
Commit
17fe572
·
verified ·
1 Parent(s): fdb1fce

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +59 -1
app.py CHANGED
@@ -8,7 +8,7 @@ import gradio as gr
8
  import tempfile
9
  import os
10
 
11
- llasa_1b ='SebastianBodza/Kartoffel-1B-v0.2'
12
 
13
  tokenizer = AutoTokenizer.from_pretrained(llasa_1b, token=os.getenv("HF_TOKEN"))
14
 
@@ -31,6 +31,59 @@ whisper_turbo_pipe = pipeline(
31
  device='cuda',
32
  )
33
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  def ids_to_speech_tokens(speech_ids):
35
 
36
  speech_tokens_str = []
@@ -56,6 +109,11 @@ def infer(sample_audio_path, target_text, progress=gr.Progress()):
56
  with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
57
  progress(0, 'Loading and trimming audio...')
58
  waveform, sample_rate = torchaudio.load(sample_audio_path)
 
 
 
 
 
59
  if len(waveform[0])/sample_rate > 15:
60
  gr.Warning("Trimming audio to first 15secs.")
61
  waveform = waveform[:, :sample_rate*15]
 
8
  import tempfile
9
  import os
10
 
11
+ llasa_1b ='/media/bodza/Audio_Dataset/Llasa-Kartoffel-1B-v0.2'
12
 
13
  tokenizer = AutoTokenizer.from_pretrained(llasa_1b, token=os.getenv("HF_TOKEN"))
14
 
 
31
  device='cuda',
32
  )
33
 
34
+
35
# Load the Silero voice-activity-detection model through torch.hub.
# force_reload=False lets torch.hub reuse a previously downloaded copy
# instead of fetching the repository again.
vad_model, utils = torch.hub.load(
    repo_or_dir="snakers4/silero-vad",
    model="silero_vad",
    source="github",
    force_reload=False,
)

# Only the first helper returned alongside the model is used in this file;
# the remaining utilities are discarded.
get_speech_timestamps, *_ = utils
42
+
43
+
44
def remove_silence_silero(
    waveform,
    sample_rate,
    vad_model,
    *,
    padding_seconds=0.1,
    edge_pad_samples=16,
):
    """
    Trim leading and trailing silence from a waveform using Silero VAD.

    Multi-channel input is averaged down to a single channel first. Speech is
    detected on a 16 kHz copy of the signal, but the cut is applied to the
    audio at its original sample rate, so no resampling artefacts end up in
    the returned tensor.

    Args:
        waveform: torch.Tensor audio waveform (channels, samples)
        sample_rate: int sample rate of ``waveform``
        vad_model: Silero VAD model
        padding_seconds: seconds of original audio kept before the first and
            after the last detected speech segment (default 0.1)
        edge_pad_samples: number of zero samples added to both ends of the
            trimmed result (default 16)

    Returns:
        torch.Tensor of shape (1, samples). When speech is detected this is
        the span from the first speech start to the last speech end (plus
        ``padding_seconds`` of context), zero-padded by ``edge_pad_samples``
        on each side. When no speech is detected, or the input has no
        samples, the mono waveform is returned untrimmed.
    """
    vad_rate = 16000  # rate at which the VAD is run in this function

    # Down-mix to mono; everything below operates on a (1, samples) tensor.
    if waveform.size(0) > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)

    # Nothing to analyse: skip resampling and VAD entirely.
    if waveform.size(1) == 0:
        return waveform

    if sample_rate != vad_rate:
        waveform_16k = torchaudio.transforms.Resample(sample_rate, vad_rate)(waveform)
    else:
        waveform_16k = waveform

    speech_timestamps = get_speech_timestamps(
        waveform_16k[0], vad_model, sampling_rate=vad_rate
    )
    if not speech_timestamps:
        return waveform

    # Timestamps are sample indices at the VAD rate; convert them back to
    # indices at the original rate before slicing.
    first_speech = speech_timestamps[0]['start']
    last_speech = speech_timestamps[-1]['end']
    padding_samples = int(padding_seconds * sample_rate)

    start_idx = max(0, int(first_speech * sample_rate / vad_rate) - padding_samples)
    end_idx = min(
        waveform.size(1),
        int(last_speech * sample_rate / vad_rate) + padding_samples,
    )

    # Trim the original-rate waveform (not the resampled one).
    trimmed_wav = waveform[:, start_idx:end_idx]

    # Surround the result with a few zero samples on both ends.
    return torch.nn.functional.pad(
        trimmed_wav, (edge_pad_samples, edge_pad_samples), "constant", 0
    )
85
+
86
+
87
  def ids_to_speech_tokens(speech_ids):
88
 
89
  speech_tokens_str = []
 
109
  with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
110
  progress(0, 'Loading and trimming audio...')
111
  waveform, sample_rate = torchaudio.load(sample_audio_path)
112
+ waveform = remove_silence_silero(waveform, sample_rate, vad_model)
113
+
114
+ # For debugging save the trimmed audio
115
+ torchaudio.save("dev.wav", waveform, sample_rate)
116
+
117
  if len(waveform[0])/sample_rate > 15:
118
  gr.Warning("Trimming audio to first 15secs.")
119
  waveform = waveform[:, :sample_rate*15]