File size: 3,656 Bytes
a67745e
 
 
 
 
 
 
 
 
 
4ecb56b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a67745e
4ecb56b
 
 
 
a67745e
4ecb56b
 
a67745e
4ecb56b
 
a67745e
4ecb56b
 
a67745e
4ecb56b
 
 
 
 
a67745e
4ecb56b
a67745e
4ecb56b
 
a67745e
4ecb56b
 
 
 
 
 
 
 
 
a67745e
4ecb56b
 
a67745e
 
4ecb56b
a67745e
4ecb56b
 
a67745e
01d3f3f
a67745e
 
4ecb56b
 
 
 
 
a67745e
4ecb56b
 
 
 
 
 
 
 
 
 
 
 
 
a252329
 
4ecb56b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
"""
Module needed for pre-processing of uploaded audio
Uses silero_vad for silence removal and librosa for image generation

Author: Jakub Polnis
Copyright: Copyright 2025, Jakub Polnis
License: Apache 2.0
Email: [email protected]
"""

import io
import torch
import librosa
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from silero_vad import (load_silero_vad,
                          read_audio,
                          get_speech_timestamps,
                          save_audio,
                          VADIterator,
                          collect_chunks)
# Forwarded to load_silero_vad's ``onnx`` flag; False presumably selects the
# non-ONNX variant of the model — confirm against the silero_vad docs.
USE_ONNX = False
# Created once at import time and shared by every function in this module.
model = load_silero_vad(onnx=USE_ONNX)

# Sampling rate (Hz) passed to read_audio / get_speech_timestamps and used as
# ``sr`` for all librosa processing below.
SAMPLING_RATE = 16000

def silero_vad_remove_silence(audio_file_path):
    """Read an audio file and keep only the parts detected as speech.

    The file is read at ``SAMPLING_RATE`` and analysed with the module-level
    Silero VAD ``model``. Torch is limited to a single thread before reading.

    Args:
        audio_file_path: Path of the audio file handed to ``read_audio``.

    Returns:
        The detected speech chunks merged by ``collect_chunks``, or the audio
        exactly as read when no speech timestamps are found (a notice is
        printed in that case).
    """
    torch.set_num_threads(1)
    waveform = read_audio(audio_file_path, sampling_rate=SAMPLING_RATE)

    # Run voice-activity detection over the whole recording
    segments = get_speech_timestamps(waveform, model, sampling_rate=SAMPLING_RATE)

    if segments:
        # Stitch the speech-only regions together into one signal
        return collect_chunks(segments, waveform)

    # Nothing detected: hand the recording back untouched
    print(f"No speech detected in {audio_file_path}. Returning original audio.")
    return waveform

def _render_mel_image(y_segment, sr):
    """Render one audio segment as a mel-spectrogram PIL image.

    Args:
        y_segment: 1-D numpy array holding the samples of the segment.
        sr: Sampling rate of ``y_segment``.

    Returns:
        A ``PIL.Image.Image`` decoded from the PNG rendered by matplotlib.
    """
    # Imported explicitly: a plain ``import librosa`` does not expose the
    # ``display`` submodule on every librosa release.
    import librosa.display

    # Create mel-spectrogram and convert power to dB relative to its peak
    mel_power = librosa.feature.melspectrogram(
        y=y_segment, sr=sr, n_mels=128, fmax=8000, center=True
    )
    mel_db = librosa.power_to_db(mel_power, ref=np.max)

    # 2.24 in x 2.24 in saved at dpi=100 gives a 224x224 pixel canvas
    fig, ax = plt.subplots(figsize=(224 / 100, 224 / 100))
    try:
        librosa.display.specshow(mel_db, sr=sr, fmax=8000, ax=ax)

        # Setup axis: span the full spectrogram and hide ticks and labels
        ax.set_xlim(0, mel_power.shape[-1])
        ax.set_ylim(0, mel_power.shape[0])
        ax.set_xticks([])
        ax.set_yticks([])
        ax.set_xticklabels([])
        ax.set_yticklabels([])

        # Save into an in-memory buffer so a PIL image can be returned.
        # ``fig.savefig`` targets this figure explicitly instead of relying
        # on pyplot's notion of the current figure.
        with io.BytesIO() as buffer:
            fig.savefig(buffer, format='PNG', bbox_inches=None, pad_inches=0,
                        dpi=100, transparent=True)
            buffer.seek(0)
            # copy() yields an image that no longer depends on the buffer,
            # which is closed when this ``with`` block exits
            return Image.open(buffer).copy()
    finally:
        # Always release the figure, even when rendering or saving fails
        plt.close(fig)


def create_mel_spectrograms(file_path, segment_duration, start_offset):
    """Turn an audio file into a list of mel-spectrogram images.

    Silence is removed with ``silero_vad_remove_silence``, the remaining
    signal is trimmed or zero-padded to a whole number of seconds, and a
    window of ``segment_duration`` seconds is rendered at every whole-second
    position where it fits completely inside the signal.

    Args:
        file_path: Path of the audio file to process.
        segment_duration: Length of each window in seconds. A value that
            amounts to zero or fewer samples yields an empty list.
        start_offset: Currently unused; windows always start at whole-second
            positions. Kept so existing callers continue to work.

    Returns:
        A list of ``PIL.Image.Image`` objects, one per rendered window, in
        chronological order. Empty when no window fits.
    """
    # Call silero_vad to remove silence
    processed_audio = silero_vad_remove_silence(file_path)
    y = processed_audio.numpy()
    sr = SAMPLING_RATE

    # Window length in samples; nothing can be rendered for an empty window
    segment_duration_samples = int(segment_duration * sr)
    if segment_duration_samples <= 0:
        return []

    # Closest whole number of seconds to the audio duration, and the
    # matching length in samples
    rounded_duration = int(np.round(librosa.get_duration(y=y, sr=sr)))
    target_length = rounded_duration * sr

    # Trim or zero-pad the signal to exactly that length
    if len(y) > target_length:
        y = y[:target_length]
    elif len(y) < target_length:
        y = np.pad(y, (0, target_length - len(y)), mode='constant')

    # One window per whole second, stopping at the last start position whose
    # window still ends inside the signal
    last_start_exclusive = target_length - segment_duration_samples + 1
    return [
        _render_mel_image(y[start_sample:start_sample + segment_duration_samples], sr)
        for start_sample in range(0, last_start_exclusive, sr)
    ]