import gradio as gr
import numpy as np
from pydub import AudioSegment
from pydub.silence import detect_nonsilent
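# Note: pydub's MP3 decoding/encoding relies on ffmpeg (or libav) being installed and on PATH.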
import io
import csv


def numpy_to_audiosegment(audio_array, sampling_rate):
    """Converts a NumPy audio array into a Pydub AudioSegment."""
    # Gradio's type="numpy" audio may arrive as a 2-D (samples, channels) array
    # for stereo input; downmix to mono for the channels=1 segment built below.
    if audio_array.ndim == 2:
        audio_array = audio_array.mean(axis=1).astype(audio_array.dtype)

    if np.issubdtype(audio_array.dtype, np.floating):
        max_val = np.max(np.abs(audio_array))
        if max_val > 0:  # Avoid division by zero on all-silent input
            audio_array = (audio_array / max_val) * 32767  # Normalize to 16-bit range
        audio_array = audio_array.astype(np.int16)

    audio_segment = AudioSegment(
        audio_array.tobytes(),
        frame_rate=sampling_rate,
        sample_width=audio_array.dtype.itemsize,
        channels=1
    )
    return audio_segment


def audiosegment_to_numpy(audio_segment):
    """Converts a Pydub AudioSegment back into a NumPy array."""
    samples = np.array(audio_segment.get_array_of_samples())
    return samples


def split_audio_on_silence(audio_segment, chunk_length_s, silence_thresh=-40, min_silence_len=500):
    """Splits audio into chunks at nonsilent ranges, capping each chunk at chunk_length_s."""
    max_length = chunk_length_s * 1000  # Convert to milliseconds
    nonsilent_ranges = detect_nonsilent(audio_segment, min_silence_len=min_silence_len, silence_thresh=silence_thresh)

    chunks = []

    for start, end in nonsilent_ranges:
        # Split nonsilent sections longer than max_length into fixed-size pieces
        while end - start > max_length:
            chunks.append((start, start + max_length))
            start += max_length
        # Keep the (possibly shortened) remainder; the guard skips zero-length
        # leftovers when a range is an exact multiple of max_length
        if start < end:
            chunks.append((start, end))

    return chunks
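
# Example: with max_length = 30_000 ms, a nonsilent range (0, 75_000) splits
# into (0, 30000), (30000, 60000), (60000, 75000).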


def format_time(milliseconds):
    """Formats time in milliseconds to MM:SS format."""
    seconds = milliseconds / 1000
    minutes = int(seconds // 60)
    secs = int(seconds % 60)
    return f"{minutes:02}:{secs:02}"


def numpy_to_mp3(audio_array, sampling_rate):
    """Converts a NumPy audio array to MP3-encoded bytes."""
    # Reuse the shared conversion, which handles mono downmixing and
    # normalization of floating-point input
    audio_segment = numpy_to_audiosegment(audio_array, sampling_rate)

    # Export the audio segment to MP3 bytes
    mp3_io = io.BytesIO()
    audio_segment.export(mp3_io, format="mp3", bitrate="320k")
    mp3_bytes = mp3_io.getvalue()
    mp3_io.close()

    return mp3_bytes


def stream(audio, chunk_length_s):
    """Generator: splits the input audio on silence and yields each chunk as
    MP3 bytes together with the running list of formatted timestamps."""
    sampling_rate, array = audio
    audio_segment = numpy_to_audiosegment(array, sampling_rate)

    # Split the audio based on silence
    chunks = split_audio_on_silence(audio_segment, chunk_length_s)

    # Prepare output data
    formatted_timestamps = []

    for idx, (start, end) in enumerate(chunks):
        # Extract the audio chunk
        chunk_segment = audio_segment[start:end]
        chunk_numpy = audiosegment_to_numpy(chunk_segment)
        chunk_mp3 = numpy_to_mp3(chunk_numpy, sampling_rate)

        # Format timestamps
        start_time_formatted = format_time(start)
        end_time_formatted = format_time(end)
        formatted_timestamps.append((start_time_formatted, end_time_formatted))

        yield chunk_mp3, formatted_timestamps

    # Save timestamps to CSV
    with open("silence_based_timestamps.csv", mode="w", newline="") as file:
        writer = csv.writer(file)
        writer.writerow(["Start Time", "End Time"])
        writer.writerows(formatted_timestamps)

    print(f"Timestamps saved to 'silence_based_timestamps.csv'")
    print("Formatted timestamps:")
    for start, end in formatted_timestamps:
        print(f"{start} to {end}")


# Gradio Interface
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
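            # Preloads a sample clip; assumes librispeech.wav sits next to this script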
            audio_in = gr.Audio(value="librispeech.wav", sources=["upload"], type="numpy", label="Input Audio")
            chunk_length = gr.Slider(minimum=10, maximum=30, value=30, step=5, label="Max Chunk Length (s)")
            run_button = gr.Button("Split on Silence")
        with gr.Column():
            audio_out = gr.Audio(streaming=True, autoplay=True, format="mp3", label="Streamed MP3 Audio")
            timestamps_output = gr.Dataframe(
                headers=["Start Time", "End Time"],
                label="Silence-Based Audio Chunk Timestamps",
                interactive=False
            )

    # Stream MP3 chunks to the player and fill the timestamp table as they arrive
    run_button.click(
        fn=stream,
        inputs=[audio_in, chunk_length],
        outputs=[audio_out, timestamps_output]
    )

demo.launch()
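
# By default, Gradio serves the app locally at http://127.0.0.1:7860.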