cnph001 committed on
Commit 3253b38 · verified · 1 Parent(s): 8cebcbb

Update app.py

Files changed (1):
1. app.py +163 -222

app.py CHANGED
@@ -7,38 +7,6 @@ import os
import re
from pathlib import Path
from pydub import AudioSegment
-import librosa
-import soundfile as sf
-import numpy as np
-from pydub import AudioSegment
-from pydub.playback import play
-from scipy.signal import butter, lfilter  # Ensure this line is present
-
-
-def apply_low_pass_filter(audio_segment, cutoff_freq, sample_rate, order=5):
-    """Applies a low-pass filter to a pydub AudioSegment."""
-    audio_np = np.array(audio_segment.get_array_of_samples()).astype(np.float32) / (2**15 - 1)
-    if audio_segment.channels == 2:
-        audio_np = audio_np.reshape(-1, 2)
-
-    nyquist_freq = 0.5 * sample_rate
-    normalized_cutoff = cutoff_freq / nyquist_freq
-    b, a = butter(order, normalized_cutoff, btype='low', analog=False)
-
-    filtered_data = np.zeros_like(audio_np, dtype=np.float32)
-    if audio_segment.channels == 1:
-        filtered_data = lfilter(b, a, audio_np)
-    else:
-        for channel in range(audio_segment.channels):
-            filtered_data[:, channel] = lfilter(b, a, audio_np[:, channel])
-
-    filtered_data_int16 = (filtered_data * (2**15 - 1)).astype(np.int16)
-    filtered_audio = AudioSegment(filtered_data_int16.tobytes(),
-                                  frame_rate=sample_rate,
-                                  sample_width=audio_segment.sample_width,
-                                  channels=audio_segment.channels)
-    return filtered_audio
-

def get_silence(duration_ms=1000):
    # Create silent audio segment with specified parameters
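The hunk above removes the SciPy Butterworth filter together with the librosa, soundfile, and numpy imports it pulled in. If a 3.5 kHz roll-off is ever wanted again without those dependencies, pydub's own effect covers the same ground; a minimal sketch (assuming pydub's built-in low_pass_filter, which is not used anywhere in this commit):

```python
from pydub import AudioSegment

# pydub registers low_pass_filter() on AudioSegment via pydub.effects,
# so no SciPy/numpy round-trip through raw samples is needed.
audio = AudioSegment.from_mp3("speech.mp3")           # illustrative input file
filtered = audio.low_pass_filter(3500)                # ~3.5 kHz cutoff, as before
filtered.export("speech_filtered.mp3", format="mp3")
```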
@@ -46,9 +14,11 @@ def get_silence(duration_ms=1000):
        duration=duration_ms,
        frame_rate=24000  # 24kHz sampling rate
    )
    # Set audio parameters
    silent_audio = silent_audio.set_channels(1)  # Mono
    silent_audio = silent_audio.set_sample_width(4)  # 32-bit (4 bytes per sample)
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        # Export with specific bitrate and codec parameters
        silent_audio.export(
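A quick sanity check of the silence helper (hypothetical usage; assumes get_silence() returns the temp file's path, as the export block suggests):

```python
from pydub import AudioSegment

path = get_silence(1000)             # 1 s of 24 kHz mono silence as MP3
clip = AudioSegment.from_mp3(path)
print(len(clip), clip.frame_rate, clip.channels)   # ~1000 ms, 24000, 1
# MP3 framing may pad the duration by a few milliseconds.
```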
@@ -66,212 +36,193 @@ def get_silence(duration_ms=1000):

# Get all available voices
async def get_voices():
-    try:
-        voices = await edge_tts.list_voices()
-        return {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices}
-    except Exception as e:
-        print(f"Error listing voices: {e}")
-        return {}

-async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pitch, overall_target_duration_ms=None, speed_adjustment_factor=1.0):
-    """Generates audio for a text segment, handling voice prefixes and adjusting rate for duration."""
    current_voice_full = default_voice
    current_voice_short = current_voice_full.split(" - ")[0] if current_voice_full else ""
    current_rate = rate
    current_pitch = pitch
    processed_text = text_segment.strip()
-    # print(f"Processing this text segment: '{processed_text}'")  # Debug
-    voice_map = {
-        "1F": "en-GB-SoniaNeural",
-        "2M": "en-GB-RyanNeural",
-        "3M": "en-US-BrianMultilingualNeural",
-        "2F": "en-US-JennyNeural",
-        "1M": "en-AU-WilliamNeural",
-        "3F": "en-HK-YanNeural",
-        "4M": "en-GB-ThomasNeural",
-        "4F": "en-US-EmmaNeural",
-        "1O": "en-GB-RyanNeural",    # Old man
-        "1C": "en-GB-MaisieNeural",  # Child
-        "1V": "vi-VN-HoaiMyNeural",  # Vietnamese (Female)
-        "2V": "vi-VN-NamMinhNeural", # Vietnamese (Male)
-        "3V": "vi-VN-HoaiMyNeural",  # Vietnamese (Female)
-        "4V": "vi-VN-NamMinhNeural", # Vietnamese (Male)
-    }
-    detect = 0
-    # Iterate through the voice map; if a prefix matches, switch to that voice.
-    for prefix, voice_short in voice_map.items():
-        if processed_text.startswith(prefix):
-            current_voice_short = voice_short
-            if prefix in ["1F", "3F", "1V", "3V"]:
-                current_pitch = 0
-            elif prefix in ["1O", "4V"]:
-                current_pitch = -20
-                current_rate = -10
-            detect = 1
-            processed_text = processed_text[len(prefix):].strip()  # Strip the prefix, leaving only the number or text after it
-            break
-    # match = re.search(r'([A-Za-z]+)-?(\d+)', processed_text)
-    match = re.search(r"^(-?\d+)\s*(.*)", processed_text)
    if match:
-        # prefix_pitch = match.group(1)
-        number = match.group(1)
-        print(f"Prefix match found.")  # Debug
-        current_pitch += int(number)
-        # processed_text = re.sub(r'[A-Za-z]+-?\d+', '', processed_text, count=1).strip()
-        # processed_text = re.sub(r'([A-Za-z]+)([-]?\d*)', '', processed_text, count=1).strip()
-        processed_text = match.group(2)
-    # elif detect:
-    #     processed_text = processed_text.lstrip('-0123456789').strip()  # Remove potential leftover numbers
-
    if processed_text:
        rate_str = f"{current_rate:+d}%"
        pitch_str = f"{current_pitch:+d}Hz"
-        print(f"Sending to Edge: '{processed_text}'")  # Debug
-        try:
-            communicate = edge_tts.Communicate(processed_text, current_voice_short, rate=rate_str, pitch=pitch_str)
-            with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
-                audio_path = tmp_file.name
-            await communicate.save(audio_path)
-
-            if os.path.exists(audio_path):
-                audio = AudioSegment.from_mp3(audio_path)
-                # Trim leading and trailing silence
-                def detect_leading_silence(sound, silence_threshold=-50.0, chunk_size=10):
-                    trim_ms = 0
-                    assert chunk_size > 0  # to avoid infinite loop
-                    while sound[trim_ms:trim_ms+chunk_size].dBFS < silence_threshold and trim_ms < len(sound):
-                        trim_ms += chunk_size
-                    return trim_ms
-
-                start_trim = detect_leading_silence(audio)
-                end_trim = detect_leading_silence(audio.reverse())
-                trimmed_audio = audio[start_trim:len(audio)-end_trim]
-                trimmed_audio.export(audio_path, format="mp3")  # Overwrite with trimmed version
-                return audio_path
-
-        except Exception as e:
-            print(f"Edge TTS error processing '{processed_text}': {e}")
-            return None
    return None

-async def process_transcript_line(line, next_line_start_time, default_voice, rate, pitch, overall_duration_ms, speed_adjustment_factor):
-    """Processes a single transcript line with HH:MM:SS,milliseconds timestamp."""
-    match = re.match(r'(\d{2}):(\d{2}):(\d{2}),(\d{3})\s+(.*)', line)
    if match:
-        start_h, start_m, start_s, start_ms, text_parts = match.groups()
        start_time_ms = (
-            int(start_h) * 3600000 +
-            int(start_m) * 60000 +
-            int(start_s) * 1000 +
-            int(start_ms)
        )
        audio_segments = []
-        split_parts = re.split(r'[“”"]', text_parts)
        process_next = False
        for part in split_parts:
            if part == '"':
                process_next = not process_next
                continue
            if process_next and part.strip():
-                audio_path = await generate_audio_with_voice_prefix(part, default_voice, rate, pitch, overall_duration_ms, speed_adjustment_factor)
                if audio_path:
                    audio_segments.append(audio_path)
            elif not process_next and part.strip():
-                audio_path = await generate_audio_with_voice_prefix(part, default_voice, rate, pitch, overall_duration_ms, speed_adjustment_factor)
                if audio_path:
                    audio_segments.append(audio_path)

-        if audio_segments:
-            combined_audio = AudioSegment.empty()
-            for segment_path in audio_segments:
-                try:
-                    segment = AudioSegment.from_mp3(segment_path)
-                    combined_audio += segment
-                    os.remove(segment_path)  # Clean up individual segment files
-                except Exception as e:
-                    print(f"Error loading or combining audio segment {segment_path}: {e}")
-                    return None, None, None
-
-            combined_audio_path = f"combined_audio_{start_time_ms}.mp3"
-            try:
-                combined_audio.export(combined_audio_path, format="mp3")
-                return start_time_ms, [combined_audio_path], overall_duration_ms
-            except Exception as e:
-                print(f"Error exporting combined audio: {e}")
-                return None, None, None
-
-        return start_time_ms, [], overall_duration_ms  # Return an empty list if no audio was generated

-    return None, None, None
-
-async def transcript_to_speech(transcript_text, voice, rate, pitch, speed_adjustment_factor):
    if not transcript_text.strip():
        return None, gr.Warning("Please enter transcript text.")
    if not voice:
        return None, gr.Warning("Please select a voice.")
    lines = transcript_text.strip().split('\n')
    timed_audio_segments = []
    max_end_time_ms = 0

-    for i, line in enumerate(lines):
-        next_line_start_time = None
-        if i < len(lines) - 1:
-            next_line_match = re.match(r'(\d{2}):(\d{2}):(\d{2}),(\d{3})\s+.*', lines[i+1])
-            if next_line_match:
-                nh, nm, ns, nms = next_line_match.groups()
-                next_line_start_time = (
-                    int(nh) * 3600000 +
-                    int(nm) * 60000 +
-                    int(ns) * 1000 +
-                    int(nms)
-                )
-
-        current_line_match = re.match(r'(\d{2}):(\d{2}):(\d{2}),(\d{3})\s+(.*)', line)
-        if current_line_match:
-            sh, sm, ss, sms, text_content = current_line_match.groups()
-            start_time_ms = (
-                int(sh) * 3600000 +
-                int(sm) * 60000 +
-                int(ss) * 1000 +
-                int(sms)
-            )
-            overall_duration_ms = None
-            if next_line_start_time is not None:
-                overall_duration_ms = next_line_start_time - start_time_ms
-
-            start_time, audio_paths, duration = await process_transcript_line(line, next_line_start_time, voice, rate, pitch, overall_duration_ms, speed_adjustment_factor)
-
-            if start_time is not None and audio_paths:
-                combined_line_audio = AudioSegment.empty()
-                total_generated_duration_ms = 0
-                for path in audio_paths:
-                    if path:
-                        try:
-                            audio = AudioSegment.from_mp3(path)
-                            combined_line_audio += audio
-                            total_generated_duration_ms += len(audio)
-                            os.remove(path)
-                        except FileNotFoundError:
-                            print(f"Warning: Audio file not found: {path}")
-
-                if combined_line_audio and overall_duration_ms is not None and overall_duration_ms > 0 and total_generated_duration_ms > overall_duration_ms:
-                    speed_factor = (total_generated_duration_ms / overall_duration_ms) * speed_adjustment_factor
-                    if speed_factor > 0:
-                        if speed_factor < 1.0:
-                            speed_factor = 1.0
-                        combined_line_audio = combined_line_audio.speedup(playback_speed=speed_factor)
-
-                if combined_line_audio:
-                    timed_audio_segments.append({'start': start_time, 'audio': combined_line_audio})
-                    max_end_time_ms = max(max_end_time_ms, start_time + len(combined_line_audio))
-
-            elif audio_paths:
-                for path in audio_paths:
-                    if path:
-                        try:
-                            os.remove(path)
-                        except FileNotFoundError:
-                            pass  # Clean up even if no timestamp

    if not timed_audio_segments:
        return None, "No processable audio segments found."
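The removed loop above squeezed each line's audio into the gap before the next timestamp by speeding it up. The core duration-fitting idea in isolation (a minimal sketch using pydub's speedup effect; fit_to_duration is an illustrative name, not a function in the app):

```python
from pydub import AudioSegment

def fit_to_duration(seg: AudioSegment, target_ms: int) -> AudioSegment:
    # Only ever speed audio up; the old code clamped factors below 1.0.
    if target_ms <= 0 or len(seg) <= target_ms:
        return seg
    factor = len(seg) / target_ms    # > 1.0 here, so speedup() is safe
    return seg.speedup(playback_speed=factor)
```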
@@ -279,35 +230,26 @@ async def transcript_to_speech(transcript_text, voice, rate, pitch, speed_adjust
    final_audio = AudioSegment.silent(duration=max_end_time_ms, frame_rate=24000)
    for segment in timed_audio_segments:
        final_audio = final_audio.overlay(segment['audio'], position=segment['start'])
-
-    # Apply the low-pass filter here
-    cutoff_frequency = 3500  # 3.5 kHz (you can make this a user-configurable parameter later)
-    filtered_final_audio = apply_low_pass_filter(final_audio, cutoff_frequency, final_audio.frame_rate)

    combined_audio_path = tempfile.mktemp(suffix=".mp3")
-    # Export the *filtered* audio here
-    filtered_final_audio.export(combined_audio_path, format="mp3")
    return combined_audio_path, None

@spaces.GPU
-def tts_interface(transcript, voice, rate, pitch, speed_adjustment_factor):
-    audio, warning = asyncio.run(transcript_to_speech(transcript, voice, rate, pitch, speed_adjustment_factor))
    return audio, warning

async def create_demo():
    voices = await get_voices()
    default_voice = "en-US-AndrewMultilingualNeural - en-US (Male)"
    description = """
-    Process timestamped text (HH:MM:SS,milliseconds) with voice changes within quotes.
-    The duration for each line is determined by the timestamp of the following line.
-    The speed of the ENTIRE generated audio for a line will be adjusted to fit within this duration.
-    If there is no subsequent timestamp, the speed adjustment will be skipped.
-    You can control the intensity of the speed adjustment using the "Speed Adjustment Factor" slider.
-    Format: `HH:MM:SS,milliseconds "VoicePrefix Text" more text "AnotherVoicePrefix More Text"`
    Example:
    ```
-    00:00:00,000 "This is the default voice." more default. "1F Now a female voice." and back to default.
-    00:00:05,500 "1C Yes," said the child, "it is fun!"
    ```
    ***************************************************************************************************
    1M = en-AU-WilliamNeural - en-AU (Male)
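Both the old and new parsers turn the timestamp into a millisecond offset with plain arithmetic; a worked example for the old comma-separated format (illustration only):

```python
import re

line = '00:00:05,500 "1C Yes," said the child, "it is fun!"'
m = re.match(r'(\d{2}):(\d{2}):(\d{2}),(\d{3})\s+(.*)', line)
h, mi, s, ms, text = m.groups()
start_ms = int(h) * 3600000 + int(mi) * 60000 + int(s) * 1000 + int(ms)
print(start_ms)  # 5500
```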
@@ -329,17 +271,16 @@ async def create_demo():
    demo = gr.Interface(
        fn=tts_interface,
        inputs=[
-            gr.Textbox(label="Timestamped Text with Voice Changes and Duration", lines=10, placeholder='00:00:00,000 "Text" more text "1F Different Voice"'),
            gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Default Voice", value=default_voice),
            gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
-            gr.Slider(minimum=-50, maximum=50, value=0, label="Pitch Adjustment (Hz)", step=1),
-            gr.Slider(minimum=0.5, maximum=1.5, value=1.0, step=0.05, label="Speed Adjustment Factor")
        ],
        outputs=[
            gr.Audio(label="Generated Audio", type="filepath"),
            gr.Markdown(label="Warning", visible=False)
        ],
-        title="TTS with Line-Wide Duration Adjustment and In-Quote Voice Switching",
        description=description,
        analytics_enabled=False,
        allow_flagging=False
@@ -348,4 +289,4 @@ async def create_demo():

if __name__ == "__main__":
    demo = asyncio.run(create_demo())
-    demo.launch()

app.py (updated file)

import re
from pathlib import Path
from pydub import AudioSegment

def get_silence(duration_ms=1000):
    # Create silent audio segment with specified parameters

        duration=duration_ms,
        frame_rate=24000  # 24kHz sampling rate
    )
+
    # Set audio parameters
    silent_audio = silent_audio.set_channels(1)  # Mono
    silent_audio = silent_audio.set_sample_width(4)  # 32-bit (4 bytes per sample)
+
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        # Export with specific bitrate and codec parameters
        silent_audio.export(

# Get all available voices
async def get_voices():
+    voices = await edge_tts.list_voices()
+    return {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices}
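get_voices() now lets any edge_tts.list_voices() failure propagate instead of returning an empty dict. A hypothetical smoke test of the mapping it builds:

```python
import asyncio

voices = asyncio.run(get_voices())
# Keys look like "en-GB-SoniaNeural - en-GB (Female)"; values are the short names.
print(next(iter(voices.items())))
```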
+async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pitch):
+    """Generates audio for a text segment, handling voice prefixes."""
    current_voice_full = default_voice
    current_voice_short = current_voice_full.split(" - ")[0] if current_voice_full else ""
    current_rate = rate
    current_pitch = pitch
    processed_text = text_segment.strip()
+    voice1_full = "en-AU-WilliamNeural - en-AU (Male)"
+    voice1_short = voice1_full.split(" - ")[0]
+    voice1F_full = "en-GB-SoniaNeural - en-GB (Female)"
+    voice1F_short = voice1F_full.split(" - ")[0]
+    voice2_full = "en-GB-RyanNeural - en-GB (Male)"
+    voice2_short = voice2_full.split(" - ")[0]
+    voice2F_full = "en-US-JennyNeural - en-US (Female)"
+    voice2F_short = voice2F_full.split(" - ")[0]
+    voice3_full = "en-US-BrianMultilingualNeural - en-US (Male)"  # Good for reading
+    voice3_short = voice3_full.split(" - ")[0]
+    voice3F_full = "en-HK-YanNeural - en-HK (Female)"
+    voice3F_short = voice3F_full.split(" - ")[0]
+    voice4_full = "en-GB-ThomasNeural - en-GB (Male)"
+    voice4_short = voice4_full.split(" - ")[0]
+    voice4F_full = "en-US-EmmaNeural - en-US (Female)"
+    voice4F_short = voice4F_full.split(" - ")[0]
+    voice5_full = "en-GB-RyanNeural - en-GB (Male)"  # Old man
+    voice5_short = voice5_full.split(" - ")[0]
+    voice6_full = "en-GB-MaisieNeural - en-GB (Female)"  # Child
+    voice6_short = voice6_full.split(" - ")[0]
+    voice7_full = "vi-VN-HoaiMyNeural - vi-VN (Female)"  # Vietnamese
+    voice7_short = voice7_full.split(" - ")[0]
+    voice8_full = "vi-VN-NamMinhNeural - vi-VN (Male)"  # Vietnamese
+    voice8_short = voice8_full.split(" - ")[0]
+    voice9F_full = "de-DE-SeraphinaMultilingualNeural - de-DE (Female)"  # Multilingual, used for Vietnamese
+    voice9F_short = voice9F_full.split(" - ")[0]
+    voice9_full = "ko-KR-HyunsuMultilingualNeural - ko-KR (Male)"  # Multilingual, used for Vietnamese
+    voice9_short = voice9_full.split(" - ")[0]
+    detect = 0
+    if processed_text.startswith("1F"):
+        current_voice_short = voice1F_short
+        current_pitch = 25
+        detect = 1
+    elif processed_text.startswith("2F"):
+        current_voice_short = voice2F_short
+        detect = 1
+    elif processed_text.startswith("3F"):
+        current_voice_short = voice3F_short
+        detect = 1
+    elif processed_text.startswith("4F"):
+        current_voice_short = voice4F_short
+        detect = 1
+    elif processed_text.startswith("1M"):
+        current_voice_short = voice1_short
+        detect = 1
+    elif processed_text.startswith("2M"):
+        current_voice_short = voice2_short
+        detect = 1
+    elif processed_text.startswith("3M"):
+        current_voice_short = voice3_short
+        detect = 1
+    elif processed_text.startswith("4M"):
+        current_voice_short = voice4_short
+        detect = 1
+    elif processed_text.startswith("1O"):  # Old man voice
+        current_voice_short = voice5_short
+        current_pitch = -20
+        current_rate = -10
+        detect = 1
+    elif processed_text.startswith("1C"):  # Child voice
+        current_voice_short = voice6_short
+        detect = 1
+    elif processed_text.startswith("1V"):  # Female VN
+        current_voice_short = voice7_short
+        detect = 1
+    elif processed_text.startswith("2V"):
+        current_voice_short = voice8_short
+        detect = 1
+    elif processed_text.startswith("3V"):  # Female VN (multilingual voice)
+        current_voice_short = voice9F_short
+        current_pitch = 25
+        detect = 1
+    elif processed_text.startswith("4V"):
+        current_voice_short = voice9_short
+        current_pitch = -20
+        detect = 1
+    # Look for a number following the prefix; it is a pitch offset.
+    # match = re.search(r'[A-Za-z]\d+', part)  # Letter followed by one or more digits
+    match = re.search(r'[A-Za-z]+-?\d+', processed_text)  # Letter(s) followed by an optional '-' and digits
    if match:
+        # Extract the prefix (e.g., '2F') and the number (e.g., '-20')
+        prefix = ''.join([ch for ch in match.group() if ch.isalpha()])
+        number = int(''.join([ch for ch in match.group() if ch.isdigit() or ch == '-']))
+        current_pitch += number
+        # Remove the matched prefix-and-number (e.g., '2F-20') from the string
+        new_text = re.sub(r'[A-Za-z]+-?\d+', '', processed_text, count=1).strip()
+        processed_text = new_text[len(prefix):]  # Dynamically remove what is left of the prefix
+    else:
+        if detect:
+            processed_text = processed_text[2:]
    if processed_text:
        rate_str = f"{current_rate:+d}%"
        pitch_str = f"{current_pitch:+d}Hz"
+        communicate = edge_tts.Communicate(processed_text, current_voice_short, rate=rate_str, pitch=pitch_str)
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
+            audio_path = tmp_file.name
+        await communicate.save(audio_path)
+        return audio_path
    return None
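The prefix-and-pitch extraction above is easiest to follow on a concrete input; an illustrative trace (not code from the commit) of how "2F-20" is consumed:

```python
import re

processed_text = '2F-20 Hello there'
m = re.search(r'[A-Za-z]+-?\d+', processed_text)  # matches 'F-20', not '2F-20'
prefix = ''.join(ch for ch in m.group() if ch.isalpha())                    # 'F'
number = int(''.join(ch for ch in m.group() if ch.isdigit() or ch == '-'))  # -20
new_text = re.sub(r'[A-Za-z]+-?\d+', '', processed_text, count=1).strip()   # '2 Hello there'
print(new_text[len(prefix):])  # ' Hello there': the stray leading '2' is sliced off with the prefix
```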
+async def process_transcript_line(line, default_voice, rate, pitch):
+    """Processes a single transcript line with HH:MM:SS.milliseconds timestamp and quoted text segments."""
+    match = re.match(r'(\d{2}):(\d{2}):(\d{2})\.(\d{3})\s+(.*)', line)
    if match:
+        hours, minutes, seconds, milliseconds, text_parts = match.groups()
        start_time_ms = (
+            int(hours) * 3600000 +
+            int(minutes) * 60000 +
+            int(seconds) * 1000 +
+            int(milliseconds)
        )
        audio_segments = []
+        split_parts = re.split(r'(")', text_parts)  # Split by quote marks, keeping the quotes
+
        process_next = False
        for part in split_parts:
            if part == '"':
                process_next = not process_next
                continue
            if process_next and part.strip():
+                audio_path = await generate_audio_with_voice_prefix(part, default_voice, rate, pitch)
                if audio_path:
                    audio_segments.append(audio_path)
            elif not process_next and part.strip():
+                audio_path = await generate_audio_with_voice_prefix(part, default_voice, rate, pitch)  # Process unquoted text with default voice
                if audio_path:
                    audio_segments.append(audio_path)

+        return start_time_ms, audio_segments
+    return None, None
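Unlike the old r'[“”"]' pattern, which discarded the quotes (so the process_next toggle never fired), re.split(r'(")', ...) keeps each delimiter as its own list element; a short demonstration (illustration only):

```python
import re

text = '"1C Yes," said the child, "it is fun!"'
print(re.split(r'(")', text))
# ['', '"', '1C Yes,', '"', ' said the child, ', '"', 'it is fun!', '"', '']
# Every '"' element flips process_next, so quoted parts go through the
# voice-prefix handling and unquoted parts use the default voice.
```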
+async def transcript_to_speech(transcript_text, voice, rate, pitch):
    if not transcript_text.strip():
        return None, gr.Warning("Please enter transcript text.")
    if not voice:
        return None, gr.Warning("Please select a voice.")
+
    lines = transcript_text.strip().split('\n')
    timed_audio_segments = []
    max_end_time_ms = 0

+    for line in lines:
+        start_time, audio_paths = await process_transcript_line(line, voice, rate, pitch)
+        if start_time is not None and audio_paths:
+            combined_line_audio = AudioSegment.empty()
+            for path in audio_paths:
+                try:
+                    audio = AudioSegment.from_mp3(path)
+                    combined_line_audio += audio
+                    os.remove(path)
+                except FileNotFoundError:
+                    print(f"Warning: Audio file not found: {path}")
+
+            if combined_line_audio:
+                timed_audio_segments.append({'start': start_time, 'audio': combined_line_audio})
+                max_end_time_ms = max(max_end_time_ms, start_time + len(combined_line_audio))
+        elif audio_paths:
+            for path in audio_paths:
+                try:
+                    os.remove(path)
+                except FileNotFoundError:
+                    pass  # Clean up even if no timestamp

    if not timed_audio_segments:
        return None, "No processable audio segments found."

    final_audio = AudioSegment.silent(duration=max_end_time_ms, frame_rate=24000)
    for segment in timed_audio_segments:
        final_audio = final_audio.overlay(segment['audio'], position=segment['start'])

    combined_audio_path = tempfile.mktemp(suffix=".mp3")
+    final_audio.export(combined_audio_path, format="mp3")
    return combined_audio_path, None

@spaces.GPU
+def tts_interface(transcript, voice, rate, pitch):
+    audio, warning = asyncio.run(transcript_to_speech(transcript, voice, rate, pitch))
    return audio, warning
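Final assembly overlays each line's audio onto a silent canvas at its start offset. The same pattern in isolation (a minimal sketch, independent of the app's data):

```python
from pydub import AudioSegment

# Two illustrative clips and their start offsets in milliseconds.
clips = [(0, AudioSegment.silent(400)), (5500, AudioSegment.silent(800))]

canvas_ms = max(start + len(seg) for start, seg in clips)        # 6300
timeline = AudioSegment.silent(duration=canvas_ms, frame_rate=24000)
for start, seg in clips:
    timeline = timeline.overlay(seg, position=start)             # place at offset
```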
async def create_demo():
    voices = await get_voices()
    default_voice = "en-US-AndrewMultilingualNeural - en-US (Male)"
    description = """
+    Process timestamped text (HH:MM:SS.milliseconds) with voice changes within quotes.
+    Format: `HH:MM:SS.milliseconds "VoicePrefix Text" more text "AnotherVoicePrefix More Text"`
    Example:
    ```
+    00:00:00.000 "This is the default voice." more default. "1F Now a female voice." and back to default.
+    00:00:05.000 "1C Yes," said the child, "it is fun!"
    ```
    ***************************************************************************************************
    1M = en-AU-WilliamNeural - en-AU (Male)

    demo = gr.Interface(
        fn=tts_interface,
        inputs=[
+            gr.Textbox(label="Timestamped Text with Voice Changes", lines=10, placeholder='00:00:00.000 "Text" more text "1F Different Voice"'),
            gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Default Voice", value=default_voice),
            gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
+            gr.Slider(minimum=-50, maximum=50, value=0, label="Pitch Adjustment (Hz)", step=1)
        ],
        outputs=[
            gr.Audio(label="Generated Audio", type="filepath"),
            gr.Markdown(label="Warning", visible=False)
        ],
+        title="TTS with HH:MM:SS.milliseconds and In-Quote Voice Switching",
        description=description,
        analytics_enabled=False,
        allow_flagging=False

if __name__ == "__main__":
    demo = asyncio.run(create_demo())
+    demo.launch()
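With the updated format, an input like the following should drive the pipeline end to end (hypothetical direct call, bypassing the Gradio UI):

```python
import asyncio

transcript = (
    '00:00:00.000 "Narration starts here." "1F Then a female voice."\n'
    '00:00:05.000 "1C Yes," said the child, "it is fun!"'
)
path, warning = asyncio.run(
    transcript_to_speech(
        transcript,
        "en-US-AndrewMultilingualNeural - en-US (Male)",  # default voice
        0,   # rate adjustment, %
        0,   # pitch adjustment, Hz
    )
)
print(path or warning)
```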