cnph001 committed on
Commit
9921538
·
verified ·
1 Parent(s): 3253b38

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -11
app.py CHANGED
@@ -163,7 +163,7 @@ async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pi
163
 
164
  async def process_transcript_line(line, default_voice, rate, pitch):
165
  """Processes a single transcript line with HH:MM:SS.milliseconds timestamp and quoted text segments."""
166
- match = re.match(r'(\d{2}):(\d{2}):(\d{2})\.(\d{3})\s+(.*)', line)
167
  if match:
168
  hours, minutes, seconds, milliseconds, text_parts = match.groups()
169
  start_time_ms = (
@@ -201,8 +201,9 @@ async def transcript_to_speech(transcript_text, voice, rate, pitch):
201
  lines = transcript_text.strip().split('\n')
202
  timed_audio_segments = []
203
  max_end_time_ms = 0
 
204
 
205
- for line in lines:
206
  start_time, audio_paths = await process_transcript_line(line, voice, rate, pitch)
207
  if start_time is not None and audio_paths:
208
  combined_line_audio = AudioSegment.empty()
@@ -215,8 +216,27 @@ async def transcript_to_speech(transcript_text, voice, rate, pitch):
215
  print(f"Warning: Audio file not found: {path}")
216
 
217
  if combined_line_audio:
218
- timed_audio_segments.append({'start': start_time, 'audio': combined_line_audio})
219
- max_end_time_ms = max(max_end_time_ms, start_time + len(combined_line_audio))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
220
  elif audio_paths:
221
  for path in audio_paths:
222
  try:
@@ -244,12 +264,12 @@ async def create_demo():
244
  voices = await get_voices()
245
  default_voice = "en-US-AndrewMultilingualNeural - en-US (Male)"
246
  description = """
247
- Process timestamped text (HH:MM:SS.milliseconds) with voice changes within quotes.
248
- Format: `HH:MM:SS.milliseconds "VoicePrefix Text" more text "AnotherVoicePrefix More Text"`
249
  Example:
250
  ```
251
- 00:00:00.000 "This is the default voice." more default. "1F Now a female voice." and back to default.
252
- 00:00:05.000 "1C Yes," said the child, "it is fun!"
253
  ```
254
  ***************************************************************************************************
255
  1M = en-AU-WilliamNeural - en-AU (Male)
@@ -271,16 +291,16 @@ async def create_demo():
271
  demo = gr.Interface(
272
  fn=tts_interface,
273
  inputs=[
274
- gr.Textbox(label="Timestamped Text with Voice Changes", lines=10, placeholder='00:00:00.000 "Text" more text "1F Different Voice"'),
275
  gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Default Voice", value=default_voice),
276
  gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
277
- gr.Slider(minimum=-50, maximum=50, value=0, label="Pitch Adjustment (Hz)", step=1)
278
  ],
279
  outputs=[
280
  gr.Audio(label="Generated Audio", type="filepath"),
281
  gr.Markdown(label="Warning", visible=False)
282
  ],
283
- title="TTS with HH:MM:SS.milliseconds and In-Quote Voice Switching",
284
  description=description,
285
  analytics_enabled=False,
286
  allow_flagging=False
 
163
 
164
  async def process_transcript_line(line, default_voice, rate, pitch):
165
  """Processes a single transcript line with HH:MM:SS.milliseconds timestamp and quoted text segments."""
166
+ match = re.match(r'(\d{2}):(\d{2}):(\d{2}),(\d{3})\s+(.*)', line) # Modified timestamp regex
167
  if match:
168
  hours, minutes, seconds, milliseconds, text_parts = match.groups()
169
  start_time_ms = (
 
201
  lines = transcript_text.strip().split('\n')
202
  timed_audio_segments = []
203
  max_end_time_ms = 0
204
+ previous_end_time_ms = 0
205
 
206
+ for i, line in enumerate(lines):
207
  start_time, audio_paths = await process_transcript_line(line, voice, rate, pitch)
208
  if start_time is not None and audio_paths:
209
  combined_line_audio = AudioSegment.empty()
 
216
  print(f"Warning: Audio file not found: {path}")
217
 
218
  if combined_line_audio:
219
+ current_audio_duration = len(combined_line_audio)
220
+ intended_start_time = start_time
221
+
222
+ if i > 0:
223
+ previous_line_match = re.match(r'(\d{2}):(\d{2}):(\d{2}),(\d{3})\s+.*', lines[i-1])
224
+ if previous_line_match:
225
+ prev_h, prev_m, prev_s, prev_ms = previous_line_match.groups()
226
+ previous_start_time_ms = (
227
+ int(prev_h) * 3600000 +
228
+ int(prev_m) * 60000 +
229
+ int(prev_s) * 1000 +
230
+ int(prev_ms)
231
+ )
232
+ time_difference = start_time - previous_start_time_ms
233
+
234
+ if current_audio_duration > time_difference and timed_audio_segments:
235
+ intended_start_time = previous_end_time_ms # Append to the previous audio
236
+
237
+ timed_audio_segments.append({'start': intended_start_time, 'audio': combined_line_audio})
238
+ previous_end_time_ms = intended_start_time + current_audio_duration
239
+ max_end_time_ms = max(max_end_time_ms, previous_end_time_ms)
240
  elif audio_paths:
241
  for path in audio_paths:
242
  try:
 
264
  voices = await get_voices()
265
  default_voice = "en-US-AndrewMultilingualNeural - en-US (Male)"
266
  description = """
267
+ Process timestamped text (HH:MM:SS,milliseconds) with voice changes within quotes.
268
+ Format: `HH:MM:SS,milliseconds "VoicePrefix Text" more text "AnotherVoicePrefix More Text"`
269
  Example:
270
  ```
271
+ 00:00:00,000 "This is the default voice." more default. "1F Now a female voice." and back to default.
272
+ 00:00:05,000 "1C Yes," said the child, "it is fun!"
273
  ```
274
  ***************************************************************************************************
275
  1M = en-AU-WilliamNeural - en-AU (Male)
 
291
  demo = gr.Interface(
292
  fn=tts_interface,
293
  inputs=[
294
+ gr.Textbox(label="Timestamped Text with Voice Changes", lines=10, placeholder='00:00:00,000 "Text" more text "1F Different Voice"'),
295
  gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Default Voice", value=default_voice),
296
  gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
297
+ gr.Slider(minimum=-50, maximum=50, value=0, label="Pitch Adjustment (Hz)", step=1)
298
  ],
299
  outputs=[
300
  gr.Audio(label="Generated Audio", type="filepath"),
301
  gr.Markdown(label="Warning", visible=False)
302
  ],
303
+ title="TTS with HH:MM:SS,milliseconds and In-Quote Voice Switching",
304
  description=description,
305
  analytics_enabled=False,
306
  allow_flagging=False