import gradio as gr
import numpy as np
from pydub import AudioSegment
from pydub.silence import detect_nonsilent
import io
import csv


def numpy_to_audiosegment(audio_array, sampling_rate):
    """Converts a NumPy audio array into a Pydub AudioSegment."""
    if np.issubdtype(audio_array.dtype, np.floating):
        max_val = np.max(np.abs(audio_array))
        if max_val > 0:  # Avoid division by zero for all-silent input
            audio_array = (audio_array / max_val) * 32767  # Normalize to 16-bit range
        audio_array = audio_array.astype(np.int16)
    audio_segment = AudioSegment(
        audio_array.tobytes(),
        frame_rate=sampling_rate,
        sample_width=audio_array.dtype.itemsize,
        channels=1,
    )
    return audio_segment


def audiosegment_to_numpy(audio_segment):
    """Converts a Pydub AudioSegment back into a NumPy array."""
    samples = np.array(audio_segment.get_array_of_samples())
    return samples


def split_audio_on_silence(audio_segment, chunk_length_s, silence_thresh=-40, min_silence_len=500):
    """Splits audio into chunks based on silence, each chunk <= chunk_length_s."""
    max_length = chunk_length_s * 1000  # Convert to milliseconds
    nonsilent_ranges = detect_nonsilent(
        audio_segment, min_silence_len=min_silence_len, silence_thresh=silence_thresh
    )
    chunks = []
    for start, end in nonsilent_ranges:
        if end - start > max_length:
            # Split long non-silent sections into smaller chunks
            while start + max_length <= end:
                chunks.append((start, start + max_length))
                start += max_length
        if start < end:  # Append the (possibly shortened) remainder of the range
            chunks.append((start, end))
    return chunks


def format_time(milliseconds):
    """Formats time in milliseconds to MM:SS format."""
    seconds = milliseconds / 1000
    minutes = int(seconds // 60)
    secs = int(seconds % 60)
    return f"{minutes:02}:{secs:02}"


def numpy_to_mp3(audio_array, sampling_rate):
    """Converts a NumPy audio array to MP3 bytes."""
    audio_segment = numpy_to_audiosegment(audio_array, sampling_rate)

    # Export the audio segment to MP3 bytes
    mp3_io = io.BytesIO()
    audio_segment.export(mp3_io, format="mp3", bitrate="320k")
    mp3_bytes = mp3_io.getvalue()
    mp3_io.close()
    return mp3_bytes


def stream(audio, chunk_length_s):
    sampling_rate, array = audio
    audio_segment = numpy_to_audiosegment(array, sampling_rate)

    # Split the audio based on silence
    chunks = split_audio_on_silence(audio_segment, chunk_length_s)

    # Stream each chunk as MP3 bytes together with the running list of timestamps
    formatted_timestamps = []
    for start, end in chunks:
        # Extract the audio chunk and encode it as MP3
        chunk_segment = audio_segment[start:end]
        chunk_numpy = audiosegment_to_numpy(chunk_segment)
        chunk_mp3 = numpy_to_mp3(chunk_numpy, sampling_rate)

        # Format timestamps
        start_time_formatted = format_time(start)
        end_time_formatted = format_time(end)
        formatted_timestamps.append((start_time_formatted, end_time_formatted))

        yield chunk_mp3, formatted_timestamps

    # Save timestamps to CSV once all chunks have been streamed
    with open("silence_based_timestamps.csv", mode="w", newline="") as file:
        writer = csv.writer(file)
        writer.writerow(["Start Time", "End Time"])
        writer.writerows(formatted_timestamps)

    print("Timestamps saved to 'silence_based_timestamps.csv'")
    print("Formatted timestamps:")
    for start, end in formatted_timestamps:
        print(f"{start} to {end}")


# Gradio Interface
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            audio_in = gr.Audio(
                value="librispeech.wav",
                sources=["upload"],
                type="numpy",
                label="Input Audio",
            )
            chunk_length = gr.Slider(
                minimum=10, maximum=30, value=30, step=5, label="Max Chunk Length (s)"
            )
            run_button = gr.Button("Split on Silence")
        with gr.Column():
            audio_out = gr.Audio(
                streaming=True, autoplay=True, format="mp3", label="Streamed MP3 Audio"
            )
            timestamps_output = gr.Dataframe(
                headers=["Start Time", "End Time"],
                label="Silence-Based Audio Chunk Timestamps",
                interactive=False,
            )

    # Stream MP3 chunks and the silence-based timestamps to the outputs
    run_button.click(
        fn=stream,
        inputs=[audio_in, chunk_length],
        outputs=[audio_out, timestamps_output],
    )

demo.launch()