import csv
import io

import gradio as gr
import numpy as np
from pydub import AudioSegment
from pydub.silence import detect_nonsilent
def numpy_to_audiosegment(audio_array, sampling_rate):
    """Converts a NumPy audio array into a Pydub AudioSegment."""
    if np.issubdtype(audio_array.dtype, np.floating):
        # Scale float samples to the 16-bit integer range
        max_val = np.max(np.abs(audio_array))
        if max_val > 0:  # Guard against all-silent input (division by zero)
            audio_array = (audio_array / max_val) * 32767
    audio_array = audio_array.astype(np.int16)
    audio_segment = AudioSegment(
        audio_array.tobytes(),
        frame_rate=sampling_rate,
        sample_width=audio_array.dtype.itemsize,
        channels=1,
    )
    return audio_segment
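# Note (assumption): with type="numpy", Gradio delivers audio as a
# (sampling_rate, array) tuple, with float arrays expected in [-1, 1].
# The conversion above hard-codes channels=1, so stereo input would need
# to be downmixed to mono before calling it.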
def audiosegment_to_numpy(audio_segment):
    """Converts a Pydub AudioSegment back into a NumPy array."""
    samples = np.array(audio_segment.get_array_of_samples())
    return samples
def split_audio_on_silence(audio_segment, chunk_length_s, silence_thresh=-40, min_silence_len=500):
    """Splits audio into chunks based on silence, each chunk <= chunk_length_s."""
    max_length = chunk_length_s * 1000  # Convert to milliseconds
    nonsilent_ranges = detect_nonsilent(audio_segment, min_silence_len=min_silence_len, silence_thresh=silence_thresh)
    chunks = []
    for start, end in nonsilent_ranges:
        # Split nonsilent sections longer than max_length into max_length pieces
        while end - start > max_length:
            chunks.append((start, start + max_length))
            start += max_length
        if start < end:  # Skip zero-length remainders
            chunks.append((start, end))
    return chunks
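# Worked example (hypothetical values): a single 70 s nonsilent stretch with
# chunk_length_s=30 yields (0, 30000), (30000, 60000), (60000, 70000) --
# two full 30 s chunks plus a 10 s remainder.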
def format_time(milliseconds):
    """Formats time in milliseconds to MM:SS format."""
    seconds = milliseconds / 1000
    minutes = int(seconds // 60)
    secs = int(seconds % 60)
    return f"{minutes:02}:{secs:02}"
def numpy_to_mp3(audio_array, sampling_rate):
    """Converts a NumPy audio array to MP3 bytes."""
    # Reuse the same normalization and conversion as numpy_to_audiosegment
    audio_segment = numpy_to_audiosegment(audio_array, sampling_rate)
    # Export the audio segment to MP3 bytes
    mp3_io = io.BytesIO()
    audio_segment.export(mp3_io, format="mp3", bitrate="320k")
    mp3_bytes = mp3_io.getvalue()
    mp3_io.close()
    return mp3_bytes
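# Quick sanity check (sketch; the 440 Hz tone is made up for illustration, and
# MP3 export assumes ffmpeg is installed, since pydub shells out to it):
#
#   sr = 16_000
#   t = np.linspace(0, 1, sr, endpoint=False)
#   tone = (0.5 * np.sin(2 * np.pi * 440 * t)).astype(np.float32)
#   mp3_bytes = numpy_to_mp3(tone, sr)  # 1 s, 440 Hz mono MP3 clip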
def stream(audio, chunk_length_s):
    sampling_rate, array = audio
    audio_segment = numpy_to_audiosegment(array, sampling_rate)
    # Split the audio based on silence
    chunks = split_audio_on_silence(audio_segment, chunk_length_s)
    # Yield each chunk as MP3 bytes along with the timestamps gathered so far
    formatted_timestamps = []
    for start, end in chunks:
        # Extract the audio chunk (Pydub slices by milliseconds)
        chunk_segment = audio_segment[start:end]
        chunk_numpy = audiosegment_to_numpy(chunk_segment)
        chunk_mp3 = numpy_to_mp3(chunk_numpy, sampling_rate)
        # Format timestamps as MM:SS
        start_time_formatted = format_time(start)
        end_time_formatted = format_time(end)
        formatted_timestamps.append((start_time_formatted, end_time_formatted))
        yield chunk_mp3, formatted_timestamps
    # Save timestamps to CSV once all chunks have been streamed
    with open("silence_based_timestamps.csv", mode="w", newline="") as file:
        writer = csv.writer(file)
        writer.writerow(["Start Time", "End Time"])
        writer.writerows(formatted_timestamps)
    print("Timestamps saved to 'silence_based_timestamps.csv'")
    print("Formatted timestamps:")
    for start, end in formatted_timestamps:
        print(f"{start} to {end}")
# Gradio Interface
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            audio_in = gr.Audio(value="librispeech.wav", sources=["upload"], type="numpy", label="Input Audio")
            chunk_length = gr.Slider(minimum=10, maximum=30, value=30, step=5, label="Max Chunk Length (s)")
            run_button = gr.Button("Split on Silence")
        with gr.Column():
            audio_out = gr.Audio(streaming=True, autoplay=True, format="mp3", label="Streamed MP3 Audio")
            timestamps_output = gr.Dataframe(
                headers=["Start Time", "End Time"],
                label="Silence-Based Audio Chunk Timestamps",
                interactive=False,
            )
    # Stream MP3 chunks and update the timestamp table as each chunk is yielded
    run_button.click(
        fn=stream,
        inputs=[audio_in, chunk_length],
        outputs=[audio_out, timestamps_output],
    )

demo.launch()