"""
Module needed for pre-processing of uploaded audio
Uses silero_vad for silence removal and librosa for image generation
Author: Jakub Polnis
Copyright: Copyright 2025, Jakub Polnis
License: Apache 2.0
Email: [email protected]
"""
import io

import torch
import librosa
import librosa.display
import numpy as np
import matplotlib
matplotlib.use("Agg")  # non-interactive backend so figures render headlessly
import matplotlib.pyplot as plt
from PIL import Image
from silero_vad import (load_silero_vad,
                        read_audio,
                        get_speech_timestamps,
                        save_audio,
                        VADIterator,
                        collect_chunks)

USE_ONNX = False
# Load the Silero VAD model once at import time
model = load_silero_vad(onnx=USE_ONNX)
SAMPLING_RATE = 16000  # silero_vad supports 8000 or 16000 Hz input


def silero_vad_remove_silence(audio_file_path):
    """Remove silent regions from an audio file using Silero VAD.

    Returns a 1-D torch.Tensor containing only the detected speech,
    or the unmodified audio if no speech is found.
    """
    torch.set_num_threads(1)
    audio = read_audio(audio_file_path, sampling_rate=SAMPLING_RATE)
    # Get speech timestamps from the full audio file
    speech_timestamps = get_speech_timestamps(audio, model, sampling_rate=SAMPLING_RATE)
    if not speech_timestamps:
        print(f"No speech detected in {audio_file_path}. Returning original audio.")
        return audio  # Return the unmodified audio
    # Merge all speech chunks and return the result
    return collect_chunks(speech_timestamps, audio)
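

# Hypothetical helper (illustrative sketch): the `save_audio` import above can
# persist the VAD-trimmed waveform for manual inspection. The function name and
# the default output path are assumptions, not part of the upstream API.
def save_trimmed_audio(audio_file_path, out_path="trimmed.wav"):
    # Run the silence-removal step, then write the trimmed waveform to disk
    trimmed = silero_vad_remove_silence(audio_file_path)
    save_audio(out_path, trimmed, sampling_rate=SAMPLING_RATE)
    return out_path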


def create_mel_spectrograms(file_path, segment_duration, start_offset):
    """Slice silence-trimmed audio into overlapping segments and render each
    segment as a 224x224 mel-spectrogram PIL image.

    NOTE: start_offset is accepted but not currently used by the slicing loop.
    """
    duration = segment_duration
    pil_images = []
    # Remove silence with Silero VAD before slicing
    processed_audio = silero_vad_remove_silence(file_path)
    y = processed_audio.numpy()
    sr = SAMPLING_RATE
    # Duration of the trimmed audio in seconds
    audio_duration = librosa.get_duration(y=y, sr=sr)
    # Segment length in samples
    segment_duration_samples = int(duration * sr)
    # Round the duration to the nearest whole second
    rounded_duration = int(np.round(audio_duration))
    # Trim or zero-pad the signal to exactly rounded_duration seconds
    if len(y) > rounded_duration * sr:
        y = y[:rounded_duration * sr]
    elif len(y) < rounded_duration * sr:
        y = np.pad(y, (0, rounded_duration * sr - len(y)), mode='constant')
    # Slide a segment_duration-second window over the signal with a 1-second hop
    for i in range(rounded_duration):
        start_sample = i * sr
        end_sample = start_sample + segment_duration_samples
        # Skip windows that would run past the end of the signal
        if end_sample > len(y):
            continue
        y_segment = y[start_sample:end_sample]
        if len(y_segment) > 0:
            # Create the mel-spectrogram
            S = librosa.feature.melspectrogram(y=y_segment, sr=sr, n_mels=128,
                                               fmax=8000, center=True)
            # Convert power to decibels
            S_dB = librosa.power_to_db(S, ref=np.max)
            # Render as a 224x224-pixel image (figsize in inches at 100 dpi)
            fig, ax = plt.subplots(figsize=(224 / 100, 224 / 100))
            # Draw the spectrogram without axes, ticks, or labels
            librosa.display.specshow(S_dB, sr=sr, fmax=8000, ax=ax)
            ax.set_xlim(0, S.shape[-1])
            ax.set_ylim(0, S.shape[0])
            ax.set_xticks([])
            ax.set_yticks([])
            # Save into an in-memory buffer so PIL images can be returned
            buffer = io.BytesIO()
            fig.savefig(buffer, format='PNG', bbox_inches=None, pad_inches=0,
                        dpi=100, transparent=True)
            buffer.seek(0)
            # Convert the buffer to a PIL Image
            pil_image = Image.open(buffer)
            pil_images.append(pil_image.copy())  # Copy so the buffer can be closed
            # Close the buffer and figure to free memory
            buffer.close()
            plt.close(fig)
    print(f"Created {len(pil_images)} spectrogram images")
    return pil_images
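

if __name__ == "__main__":
    # Illustrative usage sketch: "example.wav" is a placeholder input path and
    # segment_duration=5 is an assumed value, not a project default.
    images = create_mel_spectrograms("example.wav", segment_duration=5, start_offset=0)
    for idx, img in enumerate(images):
        img.save(f"segment_{idx}.png")  # hypothetical output filenames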