cnph001 committed on
Commit
0d2cfad
·
verified ·
1 Parent(s): e4c6d2d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +57 -45
app.py CHANGED
@@ -47,7 +47,7 @@ async def get_voices():
47
  print(f"Error listing voices: {e}")
48
  return {}
49
 
50
- async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pitch, target_duration_ms=None, speed_adjustment_factor=1.0):
51
  """Generates audio for a text segment, handling voice prefixes and adjusting rate for duration."""
52
  current_voice_full = default_voice
53
  current_voice_short = current_voice_full.split(" - ")[0] if current_voice_full else ""
@@ -102,29 +102,18 @@ async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pi
102
  with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
103
  audio_path = tmp_file.name
104
  await communicate.save(audio_path)
105
- if target_duration_ms is not None and os.path.exists(audio_path) and target_duration_ms > 0:
106
  audio = AudioSegment.from_mp3(audio_path)
107
  audio_duration_ms = len(audio)
108
- #print(f"Generated audio duration: {audio_duration_ms}ms, Target duration: {target_duration_ms}ms") # Debug
109
 
110
- if audio_duration_ms > target_duration_ms:
111
- speed_factor = (audio_duration_ms / target_duration_ms) * speed_adjustment_factor
112
- #print(f"Speed factor (after user adjustment): {speed_factor}") # Debug
113
- if speed_factor > 0:
114
- if speed_factor < 1.0:
115
- speed_factor = 1.0
116
- audio = AudioSegment.from_file(audio_path)
117
- audio_stretched = audio.speedup(playback_speed=speed_factor)
118
- audio_stretched.export(audio_path, format="mp3")
119
- else:
120
- print("Generated audio is not longer than target duration, no speed adjustment.") # Debug
121
  return audio_path
122
  except Exception as e:
123
  print(f"Edge TTS error processing '{processed_text}': {e}")
124
  return None
125
  return None
126
 
127
- async def process_transcript_line(line, next_line_start_time, default_voice, rate, pitch, speed_adjustment_factor):
128
  """Processes a single transcript line with HH:MM:SS,milliseconds timestamp."""
129
  match = re.match(r'(\d{2}):(\d{2}):(\d{2}),(\d{3})\s+(.*)', line)
130
  if match:
@@ -135,9 +124,6 @@ async def process_transcript_line(line, next_line_start_time, default_voice, rat
135
  int(start_s) * 1000 +
136
  int(start_ms)
137
  )
138
- duration_ms = None
139
- if next_line_start_time is not None:
140
- duration_ms = next_line_start_time - start_time_ms
141
 
142
  audio_segments = []
143
  split_parts = re.split(r'[“”"]', text_parts)
@@ -147,14 +133,14 @@ async def process_transcript_line(line, next_line_start_time, default_voice, rat
147
  process_next = not process_next
148
  continue
149
  if process_next and part.strip():
150
- audio_path = await generate_audio_with_voice_prefix(part, default_voice, rate, pitch, duration_ms, speed_adjustment_factor)
151
  if audio_path:
152
  audio_segments.append(audio_path)
153
  elif not process_next and part.strip():
154
- audio_path = await generate_audio_with_voice_prefix(part, default_voice, rate, pitch, duration_ms, speed_adjustment_factor)
155
  if audio_path:
156
  audio_segments.append(audio_path)
157
- return start_time_ms, audio_segments, duration_ms
158
  return None, None, None
159
 
160
  async def transcript_to_speech(transcript_text, voice, rate, pitch, speed_adjustment_factor):
@@ -165,6 +151,7 @@ async def transcript_to_speech(transcript_text, voice, rate, pitch, speed_adjust
165
  lines = transcript_text.strip().split('\n')
166
  timed_audio_segments = []
167
  max_end_time_ms = 0
 
168
  for i, line in enumerate(lines):
169
  next_line_start_time = None
170
  if i < len(lines) - 1:
@@ -178,27 +165,52 @@ async def transcript_to_speech(transcript_text, voice, rate, pitch, speed_adjust
178
  int(nms)
179
  )
180
 
181
- start_time, audio_paths, duration = await process_transcript_line(line, next_line_start_time, voice, rate, pitch, speed_adjustment_factor)
182
- if start_time is not None and audio_paths:
183
- combined_line_audio = AudioSegment.empty()
184
- for path in audio_paths:
185
- if path: # Only process if audio_path is not None (meaning TTS was successful)
186
- try:
187
- audio = AudioSegment.from_mp3(path)
188
- combined_line_audio += audio
189
- os.remove(path)
190
- except FileNotFoundError:
191
- print(f"Warning: Audio file not found: {path}")
192
- if combined_line_audio:
193
- timed_audio_segments.append({'start': start_time, 'audio': combined_line_audio})
194
- max_end_time_ms = max(max_end_time_ms, start_time + len(combined_line_audio))
195
- elif audio_paths:
196
- for path in audio_paths:
197
- if path:
198
- try:
199
- os.remove(path)
200
- except FileNotFoundError:
201
- pass # Clean up even if no timestamp
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
202
 
203
  if not timed_audio_segments:
204
  return None, "No processable audio segments found."
@@ -221,8 +233,8 @@ async def create_demo():
221
  default_voice = "en-US-AndrewMultilingualNeural - en-US (Male)"
222
  description = """
223
  Process timestamped text (HH:MM:SS,milliseconds) with voice changes within quotes.
224
- The duration for each segment is determined by the timestamp of the following line.
225
- The speed of the generated audio will be adjusted to fit within this duration.
226
  If there is no subsequent timestamp, the speed adjustment will be skipped.
227
  You can control the intensity of the speed adjustment using the "Speed Adjustment Factor" slider.
228
  Format: `HH:MM:SS,milliseconds "VoicePrefix Text" more text "AnotherVoicePrefix More Text"`
@@ -261,7 +273,7 @@ async def create_demo():
261
  gr.Audio(label="Generated Audio", type="filepath"),
262
  gr.Markdown(label="Warning", visible=False)
263
  ],
264
- title="TTS with Dynamic Duration and In-Quote Voice Switching",
265
  description=description,
266
  analytics_enabled=False,
267
  allow_flagging=False
 
47
  print(f"Error listing voices: {e}")
48
  return {}
49
 
50
+ async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pitch, overall_target_duration_ms=None, speed_adjustment_factor=1.0):
51
  """Generates audio for a text segment, handling voice prefixes and adjusting rate for duration."""
52
  current_voice_full = default_voice
53
  current_voice_short = current_voice_full.split(" - ")[0] if current_voice_full else ""
 
102
  with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
103
  audio_path = tmp_file.name
104
  await communicate.save(audio_path)
105
+ if overall_target_duration_ms is not None and os.path.exists(audio_path) and overall_target_duration_ms > 0:
106
  audio = AudioSegment.from_mp3(audio_path)
107
  audio_duration_ms = len(audio)
108
+ # We don't do the stretching here anymore for individual segments
109
 
 
 
 
 
 
 
 
 
 
 
 
110
  return audio_path
111
  except Exception as e:
112
  print(f"Edge TTS error processing '{processed_text}': {e}")
113
  return None
114
  return None
115
 
116
+ async def process_transcript_line(line, next_line_start_time, default_voice, rate, pitch, overall_duration_ms, speed_adjustment_factor):
117
  """Processes a single transcript line with HH:MM:SS,milliseconds timestamp."""
118
  match = re.match(r'(\d{2}):(\d{2}):(\d{2}),(\d{3})\s+(.*)', line)
119
  if match:
 
124
  int(start_s) * 1000 +
125
  int(start_ms)
126
  )
 
 
 
127
 
128
  audio_segments = []
129
  split_parts = re.split(r'[“”"]', text_parts)
 
133
  process_next = not process_next
134
  continue
135
  if process_next and part.strip():
136
+ audio_path = await generate_audio_with_voice_prefix(part, default_voice, rate, pitch, overall_duration_ms, speed_adjustment_factor)
137
  if audio_path:
138
  audio_segments.append(audio_path)
139
  elif not process_next and part.strip():
140
+ audio_path = await generate_audio_with_voice_prefix(part, default_voice, rate, pitch, overall_duration_ms, speed_adjustment_factor)
141
  if audio_path:
142
  audio_segments.append(audio_path)
143
+ return start_time_ms, audio_segments, overall_duration_ms
144
  return None, None, None
145
 
146
  async def transcript_to_speech(transcript_text, voice, rate, pitch, speed_adjustment_factor):
 
151
  lines = transcript_text.strip().split('\n')
152
  timed_audio_segments = []
153
  max_end_time_ms = 0
154
+
155
  for i, line in enumerate(lines):
156
  next_line_start_time = None
157
  if i < len(lines) - 1:
 
165
  int(nms)
166
  )
167
 
168
+ current_line_match = re.match(r'(\d{2}):(\d{2}):(\d{2}),(\d{3})\s+(.*)', line)
169
+ if current_line_match:
170
+ sh, sm, ss, sms, text_content = current_line_match.groups()
171
+ start_time_ms = (
172
+ int(sh) * 3600000 +
173
+ int(sm) * 60000 +
174
+ int(ss) * 1000 +
175
+ int(sms)
176
+ )
177
+ overall_duration_ms = None
178
+ if next_line_start_time is not None:
179
+ overall_duration_ms = next_line_start_time - start_time_ms
180
+
181
+ start_time, audio_paths, duration = await process_transcript_line(line, next_line_start_time, voice, rate, pitch, overall_duration_ms, speed_adjustment_factor)
182
+
183
+ if start_time is not None and audio_paths:
184
+ combined_line_audio = AudioSegment.empty()
185
+ total_generated_duration_ms = 0
186
+ for path in audio_paths:
187
+ if path:
188
+ try:
189
+ audio = AudioSegment.from_mp3(path)
190
+ combined_line_audio += audio
191
+ total_generated_duration_ms += len(audio)
192
+ os.remove(path)
193
+ except FileNotFoundError:
194
+ print(f"Warning: Audio file not found: {path}")
195
+
196
+ if combined_line_audio and overall_duration_ms is not None and overall_duration_ms > 0 and total_generated_duration_ms > overall_duration_ms:
197
+ speed_factor = (total_generated_duration_ms / overall_duration_ms) * speed_adjustment_factor
198
+ if speed_factor > 0:
199
+ if speed_factor < 1.0:
200
+ speed_factor = 1.0
201
+ combined_line_audio = combined_line_audio.speedup(playback_speed=speed_factor)
202
+
203
+ if combined_line_audio:
204
+ timed_audio_segments.append({'start': start_time, 'audio': combined_line_audio})
205
+ max_end_time_ms = max(max_end_time_ms, start_time + len(combined_line_audio))
206
+
207
+ elif audio_paths:
208
+ for path in audio_paths:
209
+ if path:
210
+ try:
211
+ os.remove(path)
212
+ except FileNotFoundError:
213
+ pass # Clean up even if no timestamp
214
 
215
  if not timed_audio_segments:
216
  return None, "No processable audio segments found."
 
233
  default_voice = "en-US-AndrewMultilingualNeural - en-US (Male)"
234
  description = """
235
  Process timestamped text (HH:MM:SS,milliseconds) with voice changes within quotes.
236
+ The duration for each line is determined by the timestamp of the following line.
237
+ The speed of the ENTIRE generated audio for a line will be adjusted to fit within this duration.
238
  If there is no subsequent timestamp, the speed adjustment will be skipped.
239
  You can control the intensity of the speed adjustment using the "Speed Adjustment Factor" slider.
240
  Format: `HH:MM:SS,milliseconds "VoicePrefix Text" more text "AnotherVoicePrefix More Text"`
 
273
  gr.Audio(label="Generated Audio", type="filepath"),
274
  gr.Markdown(label="Warning", visible=False)
275
  ],
276
+ title="TTS with Line-Wide Duration Adjustment and In-Quote Voice Switching",
277
  description=description,
278
  analytics_enabled=False,
279
  allow_flagging=False