Update app.py
app.py CHANGED
@@ -47,7 +47,7 @@ async def get_voices():
         print(f"Error listing voices: {e}")
         return {}
 
-async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pitch, …
+async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pitch, overall_target_duration_ms=None, speed_adjustment_factor=1.0):
     """Generates audio for a text segment, handling voice prefixes and adjusting rate for duration."""
     current_voice_full = default_voice
     current_voice_short = current_voice_full.split(" - ")[0] if current_voice_full else ""
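The new keyword arguments (overall_target_duration_ms, speed_adjustment_factor) are only carried through this function now; the actual time-fitting happens per line in transcript_to_speech (last big hunk below). For orientation, a minimal sketch of the edge-tts call path the unchanged body of this function relies on, assuming the usual edge_tts.Communicate()/save() API and its "+0%"/"+0Hz" style rate and pitch strings; synth_segment is a made-up name, not the app's:

# Hypothetical helper, not the app's code: one text segment in, one MP3 path out.
import tempfile

import edge_tts


async def synth_segment(text, voice="en-US-AndrewMultilingualNeural",
                        rate="+0%", pitch="+0Hz"):
    communicate = edge_tts.Communicate(text, voice, rate=rate, pitch=pitch)
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
        audio_path = tmp.name
    await communicate.save(audio_path)  # edge-tts writes the synthesized MP3 to this path
    return audio_path
    # e.g. inside an async function: audio_path = await synth_segment("Hello there")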
@@ -102,29 +102,18 @@ async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pi
         with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
             audio_path = tmp_file.name
         await communicate.save(audio_path)
-        if …
+        if overall_target_duration_ms is not None and os.path.exists(audio_path) and overall_target_duration_ms > 0:
             audio = AudioSegment.from_mp3(audio_path)
             audio_duration_ms = len(audio)
-            # …
+            # We don't do the stretching here anymore for individual segments
 
-            if audio_duration_ms > target_duration_ms:
-                speed_factor = (audio_duration_ms / target_duration_ms) * speed_adjustment_factor
-                #print(f"Speed factor (after user adjustment): {speed_factor}") # Debug
-                if speed_factor > 0:
-                    if speed_factor < 1.0:
-                        speed_factor = 1.0
-                    audio = AudioSegment.from_file(audio_path)
-                    audio_stretched = audio.speedup(playback_speed=speed_factor)
-                    audio_stretched.export(audio_path, format="mp3")
-            else:
-                print("Generated audio is not longer than target duration, no speed adjustment.") # Debug
         return audio_path
     except Exception as e:
         print(f"Edge TTS error processing '{processed_text}': {e}")
         return None
     return None
 
-async def process_transcript_line(line, next_line_start_time, default_voice, rate, pitch, speed_adjustment_factor):
+async def process_transcript_line(line, next_line_start_time, default_voice, rate, pitch, overall_duration_ms, speed_adjustment_factor):
     """Processes a single transcript line with HH:MM:SS,milliseconds timestamp."""
     match = re.match(r'(\d{2}):(\d{2}):(\d{2}),(\d{3})\s+(.*)', line)
     if match:
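The deleted block above was the old per-segment time fitting: compare the generated length against the target window, clamp the factor to at least 1.0, and shorten with pydub's speedup. The same arithmetic survives, applied once per line further down. A rough, self-contained sketch of that logic (function name and defaults are mine, not the app's):

from pydub import AudioSegment


def fit_to_duration(audio: AudioSegment, target_ms: int,
                    adjustment_factor: float = 1.0) -> AudioSegment:
    """Speed audio up so it fits in target_ms; never slow it down."""
    if target_ms <= 0 or len(audio) <= target_ms:
        return audio  # already fits, leave it untouched
    speed = (len(audio) / target_ms) * adjustment_factor
    if speed <= 1.0:
        return audio  # factors below 1.0 would stretch the audio, which the app avoids
    return audio.speedup(playback_speed=speed)  # pydub shortens the clip by dropping small chunks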
@@ -135,9 +124,6 @@ async def process_transcript_line(line, next_line_start_time, default_voice, rat
             int(start_s) * 1000 +
             int(start_ms)
         )
-        duration_ms = None
-        if next_line_start_time is not None:
-            duration_ms = next_line_start_time - start_time_ms
 
         audio_segments = []
         split_parts = re.split(r'[“”"]', text_parts)
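The start-time arithmetic kept in this hunk comes from the HH:MM:SS,milliseconds regex that process_transcript_line matches (visible in the previous hunk's context). A quick worked example of that conversion:

import re

TIMESTAMP_RE = re.compile(r'(\d{2}):(\d{2}):(\d{2}),(\d{3})\s+(.*)')

m = TIMESTAMP_RE.match('00:01:02,345 "Hello"')
h, mins, s, ms, text = m.groups()  # ('00', '01', '02', '345', '"Hello"')
start_time_ms = int(h) * 3600000 + int(mins) * 60000 + int(s) * 1000 + int(ms)
# 0 + 60000 + 2000 + 345 = 62345 ms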
@@ -147,14 +133,14 @@ async def process_transcript_line(line, next_line_start_time, default_voice, rat
                 process_next = not process_next
                 continue
             if process_next and part.strip():
-                audio_path = await generate_audio_with_voice_prefix(part, default_voice, rate, pitch, …
+                audio_path = await generate_audio_with_voice_prefix(part, default_voice, rate, pitch, overall_duration_ms, speed_adjustment_factor)
                 if audio_path:
                     audio_segments.append(audio_path)
             elif not process_next and part.strip():
-                audio_path = await generate_audio_with_voice_prefix(part, default_voice, rate, pitch, …
+                audio_path = await generate_audio_with_voice_prefix(part, default_voice, rate, pitch, overall_duration_ms, speed_adjustment_factor)
                 if audio_path:
                     audio_segments.append(audio_path)
-        return start_time_ms, audio_segments, …
+        return start_time_ms, audio_segments, overall_duration_ms
     return None, None, None
 
 async def transcript_to_speech(transcript_text, voice, rate, pitch, speed_adjustment_factor):
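The two changed calls sit inside the quote-splitting loop shown in context: the line text is split on straight or curly quotes, and process_next flips at every quote boundary so quoted and unquoted parts are synthesized separately. A small illustration (the voice-prefix convention inside quotes is not visible in this hunk, so VoiceA/VoiceB are placeholders):

import re

text_parts = 'He said "VoiceA Hello there" and then "VoiceB Goodbye"'
parts = re.split(r'[“”"]', text_parts)
# ['He said ', 'VoiceA Hello there', ' and then ', 'VoiceB Goodbye', '']
# Each non-empty part goes through generate_audio_with_voice_prefix and yields
# its own temporary MP3, which is appended to audio_segments in order.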
@@ -165,6 +151,7 @@ async def transcript_to_speech(transcript_text, voice, rate, pitch, speed_adjust
     lines = transcript_text.strip().split('\n')
     timed_audio_segments = []
     max_end_time_ms = 0
+
     for i, line in enumerate(lines):
         next_line_start_time = None
         if i < len(lines) - 1:
@@ -178,27 +165,52 @@ async def transcript_to_speech(transcript_text, voice, rate, pitch, speed_adjust
                     int(nms)
                 )
 
-
-        if …
-        …
+        current_line_match = re.match(r'(\d{2}):(\d{2}):(\d{2}),(\d{3})\s+(.*)', line)
+        if current_line_match:
+            sh, sm, ss, sms, text_content = current_line_match.groups()
+            start_time_ms = (
+                int(sh) * 3600000 +
+                int(sm) * 60000 +
+                int(ss) * 1000 +
+                int(sms)
+            )
+            overall_duration_ms = None
+            if next_line_start_time is not None:
+                overall_duration_ms = next_line_start_time - start_time_ms
+
+            start_time, audio_paths, duration = await process_transcript_line(line, next_line_start_time, voice, rate, pitch, overall_duration_ms, speed_adjustment_factor)
+
+            if start_time is not None and audio_paths:
+                combined_line_audio = AudioSegment.empty()
+                total_generated_duration_ms = 0
+                for path in audio_paths:
+                    if path:
+                        try:
+                            audio = AudioSegment.from_mp3(path)
+                            combined_line_audio += audio
+                            total_generated_duration_ms += len(audio)
+                            os.remove(path)
+                        except FileNotFoundError:
+                            print(f"Warning: Audio file not found: {path}")
+
+                if combined_line_audio and overall_duration_ms is not None and overall_duration_ms > 0 and total_generated_duration_ms > overall_duration_ms:
+                    speed_factor = (total_generated_duration_ms / overall_duration_ms) * speed_adjustment_factor
+                    if speed_factor > 0:
+                        if speed_factor < 1.0:
+                            speed_factor = 1.0
+                        combined_line_audio = combined_line_audio.speedup(playback_speed=speed_factor)
+
+                if combined_line_audio:
+                    timed_audio_segments.append({'start': start_time, 'audio': combined_line_audio})
+                    max_end_time_ms = max(max_end_time_ms, start_time + len(combined_line_audio))
+
+            elif audio_paths:
+                for path in audio_paths:
+                    if path:
+                        try:
+                            os.remove(path)
+                        except FileNotFoundError:
+                            pass # Clean up even if no timestamp
 
     if not timed_audio_segments:
         return None, "No processable audio segments found."
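This hunk is the heart of the change: all of a line's segments are concatenated first, and the speed-up is applied once against the gap to the next timestamp, instead of squeezing every quoted segment into the full window on its own. A worked example with made-up durations:

# Two adjacent transcript lines, timestamps in the app's HH:MM:SS,mmm format:
#   00:00:01,500  first line of speech
#   00:00:03,250  second line of speech
start_ms = 1 * 1000 + 500                        # 1500 ms
next_start_ms = 3 * 1000 + 250                   # 3250 ms
overall_duration_ms = next_start_ms - start_ms   # 1750 ms available for the first line

total_generated_duration_ms = 2100               # suppose the concatenated TTS audio is 2100 ms
speed_adjustment_factor = 1.0                    # slider default
speed_factor = (total_generated_duration_ms / overall_duration_ms) * speed_adjustment_factor
# 2100 / 1750 = 1.2, so the whole line is played back 1.2x faster and lands inside its window.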
@@ -221,8 +233,8 @@ async def create_demo():
     default_voice = "en-US-AndrewMultilingualNeural - en-US (Male)"
     description = """
     Process timestamped text (HH:MM:SS,milliseconds) with voice changes within quotes.
-    The duration for each …
-    The speed of the generated audio will be adjusted to fit within this duration.
+    The duration for each line is determined by the timestamp of the following line.
+    The speed of the ENTIRE generated audio for a line will be adjusted to fit within this duration.
     If there is no subsequent timestamp, the speed adjustment will be skipped.
     You can control the intensity of the speed adjustment using the "Speed Adjustment Factor" slider.
     Format: `HH:MM:SS,milliseconds "VoicePrefix Text" more text "AnotherVoicePrefix More Text"`
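Going by the Format line in the description, an input in the documented shape could look like the lines below; the voice-prefix spellings are only a guess, since the accepted prefixes come from the app's own voice list rather than from this diff:

00:00:00,000 "en-US-AndrewMultilingualNeural Hello and welcome."
00:00:02,500 Unquoted text keeps the default voice. "en-GB-SoniaNeural Nice to be here."
00:00:06,000 "This last line has no following timestamp, so no speed adjustment is applied."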
@@ -261,7 +273,7 @@ async def create_demo():
             gr.Audio(label="Generated Audio", type="filepath"),
             gr.Markdown(label="Warning", visible=False)
         ],
-        title="TTS with …
+        title="TTS with Line-Wide Duration Adjustment and In-Quote Voice Switching",
         description=description,
         analytics_enabled=False,
         allow_flagging=False
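Not part of this diff, but for completeness: each entry appended to timed_audio_segments carries a start offset and a pydub segment, and max_end_time_ms tracks the overall length, so the assembly step that follows presumably overlays every line onto a silent base track. A sketch of what that could look like with pydub (assemble_track is a made-up name, not code from the app):

from pydub import AudioSegment


def assemble_track(timed_audio_segments, max_end_time_ms):
    """Overlay each line's audio at its start offset on a silent base (sketch only)."""
    base = AudioSegment.silent(duration=max_end_time_ms)
    for item in timed_audio_segments:
        base = base.overlay(item['audio'], position=item['start'])
    return base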