Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
@@ -8,7 +8,7 @@ import gradio as gr
|
|
8 |
import tempfile
|
9 |
import os
|
10 |
|
11 |
-
llasa_1b ='/media/bodza/Audio_Dataset/Llasa-Kartoffel-1B-v0.2'
|
12 |
|
13 |
tokenizer = AutoTokenizer.from_pretrained(llasa_1b, token=os.getenv("HF_TOKEN"))
|
14 |
|
@@ -31,6 +31,59 @@ whisper_turbo_pipe = pipeline(
|
|
31 |
device='cuda',
|
32 |
)
|
33 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
def ids_to_speech_tokens(speech_ids):
|
35 |
|
36 |
speech_tokens_str = []
|
@@ -56,6 +109,11 @@ def infer(sample_audio_path, target_text, progress=gr.Progress()):
|
|
56 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
|
57 |
progress(0, 'Loading and trimming audio...')
|
58 |
waveform, sample_rate = torchaudio.load(sample_audio_path)
|
|
|
|
|
|
|
|
|
|
|
59 |
if len(waveform[0])/sample_rate > 15:
|
60 |
gr.Warning("Trimming audio to first 15secs.")
|
61 |
waveform = waveform[:, :sample_rate*15]
|
|
|
8 |
import tempfile
|
9 |
import os
|
10 |
|
11 |
+
llasa_1b ='/media/bodza/Audio_Dataset/Llasa-Kartoffel-1B-v0.2'
|
12 |
|
13 |
tokenizer = AutoTokenizer.from_pretrained(llasa_1b, token=os.getenv("HF_TOKEN"))
|
14 |
|
|
|
31 |
device='cuda',
|
32 |
)
|
33 |
|
34 |
+
|
35 |
+
vad_model, utils = torch.hub.load(
|
36 |
+
"snakers4/silero-vad",
|
37 |
+
model="silero_vad",
|
38 |
+
force_reload=False,
|
39 |
+
source="github")
|
40 |
+
|
41 |
+
get_speech_timestamps, *_ = utils
|
42 |
+
|
43 |
+
|
44 |
+
def remove_silence_silero(waveform, sample_rate, vad_model):
|
45 |
+
"""
|
46 |
+
Remove leading silence using Silero VAD.
|
47 |
+
|
48 |
+
Args:
|
49 |
+
waveform: torch.Tensor audio waveform (channels, samples)
|
50 |
+
sample_rate: int sample rate
|
51 |
+
vad_model: Silero VAD model
|
52 |
+
"""
|
53 |
+
if waveform.size(0) > 1:
|
54 |
+
waveform = torch.mean(waveform, dim=0, keepdim=True)
|
55 |
+
|
56 |
+
original_waveform = waveform
|
57 |
+
|
58 |
+
if sample_rate != 16000:
|
59 |
+
waveform_16k = torchaudio.transforms.Resample(sample_rate, 16000)(waveform)
|
60 |
+
else:
|
61 |
+
waveform_16k = waveform
|
62 |
+
|
63 |
+
# Get speech timestamps
|
64 |
+
speech_timestamps = get_speech_timestamps(waveform_16k[0], vad_model, sampling_rate=16000)
|
65 |
+
|
66 |
+
if speech_timestamps:
|
67 |
+
# Get first speech segment start
|
68 |
+
first_speech = speech_timestamps[0]['start']
|
69 |
+
|
70 |
+
# Add small padding before speech (0.1 seconds)
|
71 |
+
padding_samples = int(0.1 * sample_rate)
|
72 |
+
start_idx = max(0, int(first_speech * sample_rate/16000) - padding_samples)
|
73 |
+
|
74 |
+
# Same for the end
|
75 |
+
last_speech = speech_timestamps[-1]['end']
|
76 |
+
end_idx = min(original_waveform.size(1), int(last_speech * sample_rate/16000) + padding_samples)
|
77 |
+
|
78 |
+
# Trim the original waveform (not the resampled one)
|
79 |
+
trimmed_wav = original_waveform[:, start_idx:end_idx]
|
80 |
+
|
81 |
+
# added padding of 16 at the start and end
|
82 |
+
return torch.nn.functional.pad(trimmed_wav, (16, 16), "constant", 0)
|
83 |
+
|
84 |
+
return original_waveform
|
85 |
+
|
86 |
+
|
87 |
def ids_to_speech_tokens(speech_ids):
|
88 |
|
89 |
speech_tokens_str = []
|
|
|
109 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
|
110 |
progress(0, 'Loading and trimming audio...')
|
111 |
waveform, sample_rate = torchaudio.load(sample_audio_path)
|
112 |
+
waveform = remove_silence_silero(waveform, sample_rate, vad_model)
|
113 |
+
|
114 |
+
# For debugging save the trimmed audio
|
115 |
+
torchaudio.save("dev.wav", waveform, sample_rate)
|
116 |
+
|
117 |
if len(waveform[0])/sample_rate > 15:
|
118 |
gr.Warning("Trimming audio to first 15secs.")
|
119 |
waveform = waveform[:, :sample_rate*15]
|