Spaces:
Runtime error
Runtime error
import io | |
import torch | |
import librosa | |
import numpy as np | |
import matplotlib.pyplot as plt | |
from PIL import Image | |
from silero_vad import (load_silero_vad, | |
read_audio, | |
get_speech_timestamps, | |
save_audio, | |
VADIterator, | |
collect_chunks) | |
# Sample rate (Hz) used when reading audio and when running speech detection.
SAMPLING_RATE = 16000

# Passed to load_silero_vad as its `onnx` flag.
USE_ONNX = False

# Silero VAD model, loaded once at import time and shared by all calls.
model = load_silero_vad(onnx=USE_ONNX)
def silero_vad_remove_silence(audio_file_path):
    """Load an audio file and keep only the parts detected as speech.

    The file is read at ``SAMPLING_RATE`` and run through the module-level
    Silero VAD ``model`` to obtain speech timestamps.

    Args:
        audio_file_path: Path of the audio file handed to ``read_audio``.

    Returns:
        The output of ``collect_chunks`` for the detected speech timestamps,
        or the audio exactly as loaded when no speech is detected.
    """
    # Limit torch to a single thread before running the model.
    torch.set_num_threads(1)

    waveform = read_audio(audio_file_path, sampling_rate=SAMPLING_RATE)

    # Speech timestamps are computed over the whole file in one pass.
    speech_regions = get_speech_timestamps(
        waveform, model, sampling_rate=SAMPLING_RATE
    )

    if speech_regions:
        # Concatenate every detected speech chunk into one signal.
        return collect_chunks(speech_regions, waveform)

    # Nothing detected: report it and hand back the audio unmodified.
    print(f"No speech detected in {audio_file_path}. Returning original audio.")
    return waveform
def _render_mel_spectrogram_image(y_segment, sr):
    """Render one audio segment as a mel-spectrogram PIL image.

    Args:
        y_segment: 1-D array of audio samples for the segment.
        sr: Sample rate of ``y_segment`` in Hz.

    Returns:
        A ``PIL.Image.Image`` decoded from the PNG rendering of the
        spectrogram; it is a copy that does not depend on any open buffer.
    """
    # Import the submodule explicitly: a plain `import librosa` does not
    # expose `librosa.display` on every librosa version.
    import librosa.display

    # Mel power spectrogram (128 mel bands, up to 8 kHz), converted to dB
    # relative to its own maximum.
    S = librosa.feature.melspectrogram(
        y=y_segment, sr=sr, n_mels=128, fmax=8000, center=True
    )
    S_dB = librosa.power_to_db(S, ref=np.max)

    # 2.24 x 2.24 inches saved at 100 DPI gives a 224 x 224 pixel canvas.
    fig, ax = plt.subplots(figsize=(224 / 100, 224 / 100))
    try:
        librosa.display.specshow(S_dB, sr=sr, fmax=8000, ax=ax)

        # Fit the axes to the spectrogram: x spans the frames, y the mel bands.
        ax.set_xlim(0, S.shape[-1])
        ax.set_ylim(0, S.shape[0])

        # Strip ticks and tick labels so only the spectrogram is drawn.
        ax.set_xticks([])
        ax.set_yticks([])
        ax.set_xticklabels([])
        ax.set_yticklabels([])

        with io.BytesIO() as buffer:
            # Save this specific figure rather than pyplot's "current" one.
            fig.savefig(buffer, format='PNG', bbox_inches=None, pad_inches=0,
                        dpi=100, transparent=True)
            buffer.seek(0)
            with Image.open(buffer) as image:
                # Copy so the returned image outlives the buffer.
                return image.copy()
    finally:
        # Always release the figure, even if rendering or saving failed.
        plt.close(fig)


def create_mel_spectrograms(file_path, segment_duration, start_offset):
    """Create mel-spectrogram images from the speech portion of an audio file.

    Silence is removed with ``silero_vad_remove_silence``; the remaining
    signal is trimmed or zero-padded to a whole number of seconds and cut
    into windows of ``segment_duration`` seconds, one window starting at
    each whole second. Windows that would run past the end of the signal
    are not rendered.

    Args:
        file_path: Path of the audio file to process.
        segment_duration: Length of each window in seconds.
        start_offset: Currently unused; kept so existing callers keep working.

    Returns:
        A list of ``PIL.Image.Image`` objects, one per rendered window
        (empty when no complete window fits).
    """
    processed_audio = silero_vad_remove_silence(file_path)
    y = processed_audio.numpy()
    sr = SAMPLING_RATE

    # Duration of the speech-only signal, rounded to the nearest whole second.
    audio_duration = librosa.get_duration(y=y, sr=sr)
    rounded_duration = int(np.round(audio_duration))
    target_length = rounded_duration * sr

    # Window length in samples.
    segment_duration_samples = int(segment_duration * sr)

    # Make the signal exactly `rounded_duration` seconds long.
    if len(y) > target_length:
        y = y[:target_length]
    elif len(y) < target_length:
        y = np.pad(y, (0, target_length - len(y)), mode='constant')

    pil_images = []
    for second in range(rounded_duration):
        start_sample = second * sr
        end_sample = start_sample + segment_duration_samples

        # Window ends only move forward, so once one overruns the signal
        # every later one does too.
        if end_sample > len(y):
            break

        y_segment = y[start_sample:end_sample]
        if len(y_segment) > 0:
            pil_images.append(_render_mel_spectrogram_image(y_segment, sr))

    # Retained from the original: echo the generated images to stdout.
    print(pil_images)
    return pil_images