import gradio as gr
import numpy as np
from pydub import AudioSegment
from pydub.silence import detect_nonsilent
import io
import csv

def numpy_to_audiosegment(audio_array, sampling_rate):
    """Converts a NumPy audio array into a Pydub AudioSegment."""
    if np.issubdtype(audio_array.dtype, np.floating):
        max_val = np.max(np.abs(audio_array))
        if max_val > 0:  # Guard against division by zero on all-silent input
            audio_array = (audio_array / max_val) * 32767  # Normalize to 16-bit range
        audio_array = audio_array.astype(np.int16)
    audio_segment = AudioSegment(
        audio_array.tobytes(),
        frame_rate=sampling_rate,
        sample_width=audio_array.dtype.itemsize,
        channels=1,
    )
    return audio_segment
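
# Note: Gradio's type="numpy" audio arrives as a (sampling_rate, array) tuple;
# the array may be integer PCM or floating point depending on the source, which
# is why the float branch above rescales before the int16 conversion.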

def audiosegment_to_numpy(audio_segment):
    """Converts a Pydub AudioSegment back into a NumPy array."""
    samples = np.array(audio_segment.get_array_of_samples())
    return samples
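
# pydub's get_array_of_samples() returns interleaved samples for multi-channel
# audio; every segment in this app is mono (channels=1), so the flat 1-D array
# is already in playback order.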

def split_audio_on_silence(audio_segment, chunk_length_s, silence_thresh=-40, min_silence_len=500):
    """Splits audio into chunks based on silence, each chunk <= chunk_length_s."""
    max_length = chunk_length_s * 1000  # Convert seconds to milliseconds
    nonsilent_ranges = detect_nonsilent(
        audio_segment, min_silence_len=min_silence_len, silence_thresh=silence_thresh
    )
    chunks = []
    for start, end in nonsilent_ranges:
        # Split nonsilent sections longer than max_length into fixed-size pieces
        while end - start > max_length:
            chunks.append((start, start + max_length))
            start += max_length
        if start < end:  # Skip zero-length remainders
            chunks.append((start, end))
    return chunks
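
# Sketch of the resulting boundaries (hypothetical numbers): with
# chunk_length_s=30, a single 70 s nonsilent range starting at 0 becomes
# (0, 30000), (30000, 60000), (60000, 70000), all values in milliseconds.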

def format_time(milliseconds):
    """Formats time in milliseconds to MM:SS format."""
    seconds = milliseconds / 1000
    minutes = int(seconds // 60)
    secs = int(seconds % 60)
    return f"{minutes:02}:{secs:02}"

def numpy_to_mp3(audio_array, sampling_rate):
    """Converts a NumPy audio array to MP3-encoded bytes."""
    # Reuse the shared conversion (including float normalization) defined above
    audio_segment = numpy_to_audiosegment(audio_array, sampling_rate)
    # Export the audio segment to MP3 bytes
    mp3_io = io.BytesIO()
    audio_segment.export(mp3_io, format="mp3", bitrate="320k")
    mp3_bytes = mp3_io.getvalue()
    mp3_io.close()
    return mp3_bytes
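
# Note: pydub has no built-in MP3 encoder; AudioSegment.export(..., format="mp3")
# shells out to ffmpeg, so ffmpeg must be installed and on the PATH.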

def stream(audio, chunk_length_s):
    """Yields (mp3_bytes, timestamps) pairs, one per silence-based chunk."""
    sampling_rate, array = audio
    audio_segment = numpy_to_audiosegment(array, sampling_rate)
    # Split the audio based on silence
    chunks = split_audio_on_silence(audio_segment, chunk_length_s)
    formatted_timestamps = []
    for start, end in chunks:
        # Extract and encode the audio chunk
        chunk_segment = audio_segment[start:end]
        chunk_numpy = audiosegment_to_numpy(chunk_segment)
        chunk_mp3 = numpy_to_mp3(chunk_numpy, sampling_rate)
        # Record the chunk's boundaries in MM:SS form
        formatted_timestamps.append((format_time(start), format_time(end)))
        yield chunk_mp3, formatted_timestamps
    # Save timestamps to CSV once all chunks have been streamed
    with open("silence_based_timestamps.csv", mode="w", newline="") as file:
        writer = csv.writer(file)
        writer.writerow(["Start Time", "End Time"])
        writer.writerows(formatted_timestamps)
    print("Timestamps saved to 'silence_based_timestamps.csv'")
    print("Formatted timestamps:")
    for start, end in formatted_timestamps:
        print(f"{start} to {end}")

# Gradio Interface
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            audio_in = gr.Audio(value="librispeech.wav", sources=["upload"], type="numpy", label="Input Audio")
            chunk_length = gr.Slider(minimum=10, maximum=30, value=30, step=5, label="Max Chunk Length (s)")
            run_button = gr.Button("Split on Silence")
        with gr.Column():
            audio_out = gr.Audio(streaming=True, autoplay=True, format="mp3", label="Streamed MP3 Audio")
            timestamps_output = gr.Dataframe(
                headers=["Start Time", "End Time"],
                label="Silence-Based Audio Chunk Timestamps",
                interactive=False,
            )
    # Stream MP3 chunks to the player and update the timestamp table as they arrive
    run_button.click(
        fn=stream,
        inputs=[audio_in, chunk_length],
        outputs=[audio_out, timestamps_output],
    )

demo.launch()