maliahson committed on
Commit 3a7880d · verified · 1 parent: dff8543

Update app.py

Files changed (1): app.py (+50 −35)
app.py CHANGED
@@ -4,37 +4,55 @@ import time
 import numpy as np
 import csv
 from pydub import AudioSegment
+from pydub.silence import detect_nonsilent
 import io
 
 
-def numpy_to_mp3(audio_array, sampling_rate):
-    # Normalize audio_array if it's floating-point
+def numpy_to_audiosegment(audio_array, sampling_rate):
+    """Converts a NumPy audio array into a Pydub AudioSegment."""
     if np.issubdtype(audio_array.dtype, np.floating):
         max_val = np.max(np.abs(audio_array))
         audio_array = (audio_array / max_val) * 32767 # Normalize to 16-bit range
         audio_array = audio_array.astype(np.int16)
 
-    # Create an audio segment from the numpy array
     audio_segment = AudioSegment(
         audio_array.tobytes(),
         frame_rate=sampling_rate,
         sample_width=audio_array.dtype.itemsize,
         channels=1
     )
+    return audio_segment
 
-    # Export the audio segment to MP3 bytes
-    mp3_io = io.BytesIO()
-    audio_segment.export(mp3_io, format="mp3", bitrate="320k")
 
-    # Get the MP3 bytes
-    mp3_bytes = mp3_io.getvalue()
-    mp3_io.close()
+def audiosegment_to_numpy(audio_segment):
+    """Converts a Pydub AudioSegment back into a NumPy array."""
+    samples = np.array(audio_segment.get_array_of_samples())
+    return samples
 
-    return mp3_bytes
 
+def split_audio_on_silence(audio_segment, chunk_length_s, silence_thresh=-40, min_silence_len=500):
+    """Splits audio into chunks based on silence, each chunk <= chunk_length_s."""
+    max_length = chunk_length_s * 1000 # Convert to milliseconds
+    nonsilent_ranges = detect_nonsilent(audio_segment, min_silence_len=min_silence_len, silence_thresh=silence_thresh)
 
-def format_time(seconds):
-    """Formats time in seconds to MM:SS format."""
+    chunks = []
+    start_time = 0
+
+    for start, end in nonsilent_ranges:
+        if end - start > max_length:
+            # Split long nonsilent sections into smaller chunks
+            while start + max_length <= end:
+                chunks.append((start, start + max_length))
+                start += max_length
+        chunks.append((start, end))
+        start_time = end
+
+    return chunks
+
+
+def format_time(milliseconds):
+    """Formats time in milliseconds to MM:SS format."""
+    seconds = milliseconds / 1000
     minutes = int(seconds // 60)
     secs = int(seconds % 60)
     return f"{minutes:02}:{secs:02}"
@@ -42,37 +60,34 @@ def format_time(seconds)
 
 def stream(audio, chunk_length_s):
     sampling_rate, array = audio
-    chunk_length = int(chunk_length_s * sampling_rate)
-    audio_length = len(array)
-    total_duration = audio_length / sampling_rate
-    num_batches = math.ceil(audio_length / chunk_length)
+    audio_segment = numpy_to_audiosegment(array, sampling_rate)
+
+    # Split the audio based on silence
+    chunks = split_audio_on_silence(audio_segment, chunk_length_s)
 
-    # Initialize a list to store formatted timestamps
+    # Prepare output data
     formatted_timestamps = []
 
-    for idx in range(num_batches):
-        start_pos = idx * chunk_length
-        end_pos = min((idx + 1) * chunk_length, audio_length)
-        chunk_start_time = start_pos / sampling_rate
-        chunk_end_time = end_pos / sampling_rate
+    for idx, (start, end) in enumerate(chunks):
+        # Extract the audio chunk
+        chunk_segment = audio_segment[start:end]
+        chunk_numpy = audiosegment_to_numpy(chunk_segment)
+        chunk_mp3 = numpy_to_mp3(chunk_numpy, sampling_rate)
 
         # Format timestamps
-        formatted_timestamps.append(
-            (format_time(chunk_start_time), format_time(chunk_end_time))
-        )
-
-        chunk = array[start_pos:end_pos]
-        chunk_mp3 = numpy_to_mp3(chunk, sampling_rate=sampling_rate)
+        start_time_formatted = format_time(start)
+        end_time_formatted = format_time(end)
+        formatted_timestamps.append((start_time_formatted, end_time_formatted))
 
         yield chunk_mp3, formatted_timestamps
 
-    # Save the formatted timestamps to a CSV file
-    with open("audio_chunk_timestamps.csv", mode="w", newline="") as file:
+    # Save timestamps to CSV
+    with open("silence_based_timestamps.csv", mode="w", newline="") as file:
         writer = csv.writer(file)
         writer.writerow(["Start Time", "End Time"])
         writer.writerows(formatted_timestamps)
 
-    print(f"Timestamps saved to 'audio_chunk_timestamps.csv'")
+    print(f"Timestamps saved to 'silence_based_timestamps.csv'")
     print("Formatted timestamps:")
     for start, end in formatted_timestamps:
         print(f"{start} to {end}")
@@ -82,17 +97,17 @@ with gr.Blocks() as demo:
     with gr.Row():
         with gr.Column():
            audio_in = gr.Audio(value="librispeech.wav", sources=["upload"], type="numpy", label="Input Audio")
-            chunk_length = gr.Slider(minimum=5, maximum=30, value=30, step=5, label="Chunk length (s)")
-            run_button = gr.Button("Split and Stream Audio")
+            chunk_length = gr.Slider(minimum=10, maximum=30, value=30, step=5, label="Max Chunk Length (s)")
+            run_button = gr.Button("Split on Silence")
         with gr.Column():
            audio_out = gr.Audio(streaming=True, autoplay=True, format="mp3", label="Streamed MP3 Audio")
            timestamps_output = gr.Dataframe(
                headers=["Start Time", "End Time"],
-                label="Audio Chunk Timestamps",
+                label="Silence-Based Audio Chunk Timestamps",
                interactive=False
            )
 
-    # Updated function outputs with the formatted timestamps table
+    # Updated function outputs with the silence-based timestamps
     run_button.click(
         fn=stream,
         inputs=[audio_in, chunk_length],
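
Note for anyone running the updated app.py: the new stream() still calls numpy_to_mp3(chunk_numpy, sampling_rate), but this commit renames the old helper to numpy_to_audiosegment() and drops its MP3 export step, and no replacement definition appears in the hunks shown here. A minimal sketch of such a helper, simply restoring the export logic of the removed numpy_to_mp3 (assuming mono audio, the previous 320k bitrate, and an ffmpeg build available to pydub), could look like this:

import io

import numpy as np
from pydub import AudioSegment


def numpy_to_mp3(audio_array, sampling_rate):
    """Convert a mono NumPy audio array to MP3 bytes (sketch, not part of this commit)."""
    # Normalize floating-point input to the 16-bit integer range
    if np.issubdtype(audio_array.dtype, np.floating):
        max_val = np.max(np.abs(audio_array))
        audio_array = (audio_array / max_val) * 32767
        audio_array = audio_array.astype(np.int16)

    audio_segment = AudioSegment(
        audio_array.tobytes(),
        frame_rate=sampling_rate,
        sample_width=audio_array.dtype.itemsize,
        channels=1
    )

    # Export to MP3 in memory and return the raw bytes; 320k matches the removed helper
    mp3_io = io.BytesIO()
    audio_segment.export(mp3_io, format="mp3", bitrate="320k")
    mp3_bytes = mp3_io.getvalue()
    mp3_io.close()
    return mp3_bytes

Keeping the export in an in-memory BytesIO avoids temporary files, so each silence-based chunk can be yielded to the streaming gr.Audio output as soon as it is encoded.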