Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -4,37 +4,55 @@ import time
|
|
4 |
import numpy as np
|
5 |
import csv
|
6 |
from pydub import AudioSegment
|
|
|
7 |
import io
|
8 |
|
9 |
|
10 |
-
def
|
11 |
-
|
12 |
if np.issubdtype(audio_array.dtype, np.floating):
|
13 |
max_val = np.max(np.abs(audio_array))
|
14 |
audio_array = (audio_array / max_val) * 32767 # Normalize to 16-bit range
|
15 |
audio_array = audio_array.astype(np.int16)
|
16 |
|
17 |
-
# Create an audio segment from the numpy array
|
18 |
audio_segment = AudioSegment(
|
19 |
audio_array.tobytes(),
|
20 |
frame_rate=sampling_rate,
|
21 |
sample_width=audio_array.dtype.itemsize,
|
22 |
channels=1
|
23 |
)
|
|
|
24 |
|
25 |
-
# Export the audio segment to MP3 bytes
|
26 |
-
mp3_io = io.BytesIO()
|
27 |
-
audio_segment.export(mp3_io, format="mp3", bitrate="320k")
|
28 |
|
29 |
-
|
30 |
-
|
31 |
-
|
|
|
32 |
|
33 |
-
return mp3_bytes
|
34 |
|
|
|
|
|
|
|
|
|
35 |
|
36 |
-
|
37 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
minutes = int(seconds // 60)
|
39 |
secs = int(seconds % 60)
|
40 |
return f"{minutes:02}:{secs:02}"
|
@@ -42,37 +60,34 @@ def format_time(seconds):
|
|
42 |
|
43 |
def stream(audio, chunk_length_s):
|
44 |
sampling_rate, array = audio
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
|
50 |
-
#
|
51 |
formatted_timestamps = []
|
52 |
|
53 |
-
for idx in
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
|
59 |
# Format timestamps
|
60 |
-
|
61 |
-
|
62 |
-
)
|
63 |
-
|
64 |
-
chunk = array[start_pos:end_pos]
|
65 |
-
chunk_mp3 = numpy_to_mp3(chunk, sampling_rate=sampling_rate)
|
66 |
|
67 |
yield chunk_mp3, formatted_timestamps
|
68 |
|
69 |
-
# Save
|
70 |
-
with open("
|
71 |
writer = csv.writer(file)
|
72 |
writer.writerow(["Start Time", "End Time"])
|
73 |
writer.writerows(formatted_timestamps)
|
74 |
|
75 |
-
print(f"Timestamps saved to '
|
76 |
print("Formatted timestamps:")
|
77 |
for start, end in formatted_timestamps:
|
78 |
print(f"{start} to {end}")
|
@@ -82,17 +97,17 @@ with gr.Blocks() as demo:
|
|
82 |
with gr.Row():
|
83 |
with gr.Column():
|
84 |
audio_in = gr.Audio(value="librispeech.wav", sources=["upload"], type="numpy", label="Input Audio")
|
85 |
-
chunk_length = gr.Slider(minimum=
|
86 |
-
run_button = gr.Button("Split
|
87 |
with gr.Column():
|
88 |
audio_out = gr.Audio(streaming=True, autoplay=True, format="mp3", label="Streamed MP3 Audio")
|
89 |
timestamps_output = gr.Dataframe(
|
90 |
headers=["Start Time", "End Time"],
|
91 |
-
label="Audio Chunk Timestamps",
|
92 |
interactive=False
|
93 |
)
|
94 |
|
95 |
-
# Updated function outputs with the
|
96 |
run_button.click(
|
97 |
fn=stream,
|
98 |
inputs=[audio_in, chunk_length],
|
|
|
4 |
import numpy as np
|
5 |
import csv
|
6 |
from pydub import AudioSegment
|
7 |
+
from pydub.silence import detect_nonsilent
|
8 |
import io
|
9 |
|
10 |
|
11 |
+
def numpy_to_audiosegment(audio_array, sampling_rate):
    """Convert a NumPy audio array into a pydub AudioSegment.

    Float input is peak-normalized into the signed 16-bit range and cast to
    int16 before conversion; integer input is wrapped as-is. Mono is assumed
    (channels=1).

    Args:
        audio_array: 1-D NumPy array of samples (float or int dtype).
        sampling_rate: Sample rate in Hz for the resulting segment.

    Returns:
        AudioSegment wrapping the raw PCM bytes of ``audio_array``.
    """
    if np.issubdtype(audio_array.dtype, np.floating):
        # Guard the empty-array case: np.max on a zero-size array raises.
        max_val = np.max(np.abs(audio_array)) if audio_array.size else 0.0
        # Guard against division by zero on an all-silent (all-zero) buffer.
        if max_val > 0:
            audio_array = (audio_array / max_val) * 32767  # Normalize to 16-bit range
        audio_array = audio_array.astype(np.int16)

    audio_segment = AudioSegment(
        audio_array.tobytes(),
        frame_rate=sampling_rate,
        sample_width=audio_array.dtype.itemsize,
        channels=1,
    )
    return audio_segment
|
25 |
|
|
|
|
|
|
|
26 |
|
27 |
+
def audiosegment_to_numpy(audio_segment):
    """Convert a pydub AudioSegment back into a NumPy array of raw samples."""
    return np.array(audio_segment.get_array_of_samples())
|
31 |
|
|
|
32 |
|
33 |
+
def split_audio_on_silence(audio_segment, chunk_length_s, silence_thresh=-40, min_silence_len=500):
    """Split audio into (start_ms, end_ms) chunks bounded by silence.

    Nonsilent regions longer than ``chunk_length_s`` seconds are cut into
    fixed-size pieces of at most that length; silent gaps between regions
    are dropped entirely.

    Args:
        audio_segment: pydub AudioSegment to analyse.
        chunk_length_s: Maximum chunk length in seconds.
        silence_thresh: Silence threshold in dBFS, forwarded to detect_nonsilent.
        min_silence_len: Minimum silence length in ms, forwarded to detect_nonsilent.

    Returns:
        List of (start, end) millisecond offsets into ``audio_segment``.
    """
    max_length = chunk_length_s * 1000  # Convert to milliseconds
    nonsilent_ranges = detect_nonsilent(audio_segment, min_silence_len=min_silence_len, silence_thresh=silence_thresh)

    chunks = []

    for start, end in nonsilent_ranges:
        if end - start > max_length:
            # Split long nonsilent sections into smaller chunks
            while start + max_length <= end:
                chunks.append((start, start + max_length))
                start += max_length
        # Skip zero-length remainders: a region whose length is an exact
        # multiple of max_length would otherwise emit an empty (end, end)
        # chunk, which crashes downstream MP3 encoding on an empty array.
        if start < end:
            chunks.append((start, end))

    return chunks
|
51 |
+
|
52 |
+
|
53 |
+
def format_time(milliseconds):
    """Format a duration given in milliseconds as an MM:SS string."""
    total_seconds = milliseconds / 1000
    return f"{int(total_seconds // 60):02}:{int(total_seconds % 60):02}"
|
|
|
60 |
|
61 |
def stream(audio, chunk_length_s):
    """Yield MP3-encoded, silence-delimited chunks of the input audio.

    Args:
        audio: (sampling_rate, numpy_array) tuple as produced by gr.Audio
            with type="numpy".
        chunk_length_s: Maximum chunk length in seconds.

    Yields:
        (chunk_mp3, formatted_timestamps): MP3 bytes for the current chunk
        and the list of ("MM:SS", "MM:SS") timestamp pairs produced so far,
        so the UI table grows as chunks stream out.

    Side effects:
        After the final chunk, writes all timestamps to
        'silence_based_timestamps.csv' and prints them to stdout.
    """
    sampling_rate, array = audio
    audio_segment = numpy_to_audiosegment(array, sampling_rate)

    # Split the audio based on silence
    chunks = split_audio_on_silence(audio_segment, chunk_length_s)

    # Prepare output data
    formatted_timestamps = []

    for start, end in chunks:
        # Extract and encode the chunk; offsets are in milliseconds,
        # which is exactly how pydub slicing is indexed.
        chunk_segment = audio_segment[start:end]
        chunk_numpy = audiosegment_to_numpy(chunk_segment)
        chunk_mp3 = numpy_to_mp3(chunk_numpy, sampling_rate)

        # Format timestamps
        formatted_timestamps.append((format_time(start), format_time(end)))

        yield chunk_mp3, formatted_timestamps

    # Save timestamps to CSV
    with open("silence_based_timestamps.csv", mode="w", newline="") as file:
        writer = csv.writer(file)
        writer.writerow(["Start Time", "End Time"])
        writer.writerows(formatted_timestamps)

    print("Timestamps saved to 'silence_based_timestamps.csv'")
    print("Formatted timestamps:")
    for start, end in formatted_timestamps:
        print(f"{start} to {end}")
|
|
|
97 |
with gr.Row():
|
98 |
with gr.Column():
|
99 |
audio_in = gr.Audio(value="librispeech.wav", sources=["upload"], type="numpy", label="Input Audio")
|
100 |
+
chunk_length = gr.Slider(minimum=10, maximum=30, value=30, step=5, label="Max Chunk Length (s)")
|
101 |
+
run_button = gr.Button("Split on Silence")
|
102 |
with gr.Column():
|
103 |
audio_out = gr.Audio(streaming=True, autoplay=True, format="mp3", label="Streamed MP3 Audio")
|
104 |
timestamps_output = gr.Dataframe(
|
105 |
headers=["Start Time", "End Time"],
|
106 |
+
label="Silence-Based Audio Chunk Timestamps",
|
107 |
interactive=False
|
108 |
)
|
109 |
|
110 |
+
# Updated function outputs with the silence-based timestamps
|
111 |
run_button.click(
|
112 |
fn=stream,
|
113 |
inputs=[audio_in, chunk_length],
|