Update app.py
Browse files
app.py
CHANGED
@@ -8,6 +8,7 @@ import re
|
|
8 |
from pathlib import Path
|
9 |
from pydub import AudioSegment
|
10 |
import librosa
|
|
|
11 |
import numpy as np
|
12 |
|
13 |
def get_silence(duration_ms=1000):
|
@@ -41,7 +42,7 @@ async def get_voices():
|
|
41 |
voices = await edge_tts.list_voices()
|
42 |
return {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices}
|
43 |
|
44 |
-
async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pitch, target_duration_ms=None):
|
45 |
"""Generates audio for a text segment, handling voice prefixes and adjusting rate for duration."""
|
46 |
current_voice_full = default_voice
|
47 |
current_voice_short = current_voice_full.split(" - ")[0] if current_voice_full else ""
|
@@ -104,10 +105,9 @@ async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pi
|
|
104 |
print(f"Generated audio duration: {audio_duration_ms}ms, Target duration: {target_duration_ms}ms") # Debug
|
105 |
|
106 |
if audio_duration_ms > target_duration_ms and target_duration_ms > 0:
|
107 |
-
speed_factor = (audio_duration_ms / target_duration_ms)
|
108 |
-
print(f"Speed factor (
|
109 |
if speed_factor > 0:
|
110 |
-
speed_factor = speed_factor * 0.8 # Reduce the speed adjustment
|
111 |
if speed_factor <1.0:
|
112 |
speed_factor = 1.0
|
113 |
y, sr = librosa.load(audio_path, sr=None)
|
@@ -119,7 +119,7 @@ async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pi
|
|
119 |
return audio_path
|
120 |
return None
|
121 |
|
122 |
-
async def process_transcript_line(line, default_voice, rate, pitch):
|
123 |
"""Processes a single transcript line with HH:MM:SS,milliseconds - HH:MM:SS,milliseconds timestamp."""
|
124 |
match = re.match(r'(\d{2}):(\d{2}):(\d{2}),(\d{3})\s+-\s+(\d{2}):(\d{2}):(\d{2}),(\d{3})\s+(.*)', line)
|
125 |
if match:
|
@@ -145,18 +145,18 @@ async def process_transcript_line(line, default_voice, rate, pitch):
|
|
145 |
process_next = not process_next
|
146 |
continue
|
147 |
if process_next and part.strip():
|
148 |
-
audio_path = await generate_audio_with_voice_prefix(part, default_voice, rate, pitch, duration_ms)
|
149 |
if audio_path:
|
150 |
audio_segments.append(audio_path)
|
151 |
elif not process_next and part.strip():
|
152 |
-
audio_path = await generate_audio_with_voice_prefix(part, default_voice, rate, pitch, duration_ms)
|
153 |
if audio_path:
|
154 |
audio_segments.append(audio_path)
|
155 |
|
156 |
return start_time_ms, audio_segments, duration_ms
|
157 |
return None, None, None
|
158 |
|
159 |
-
async def transcript_to_speech(transcript_text, voice, rate, pitch):
|
160 |
if not transcript_text.strip():
|
161 |
return None, gr.Warning("Please enter transcript text.")
|
162 |
if not voice:
|
@@ -167,7 +167,7 @@ async def transcript_to_speech(transcript_text, voice, rate, pitch):
|
|
167 |
max_end_time_ms = 0
|
168 |
|
169 |
for line in lines:
|
170 |
-
start_time, audio_paths, duration = await process_transcript_line(line, voice, rate, pitch)
|
171 |
if start_time is not None and audio_paths:
|
172 |
combined_line_audio = AudioSegment.empty()
|
173 |
current_time_ms = start_time
|
@@ -176,7 +176,6 @@ async def transcript_to_speech(transcript_text, voice, rate, pitch):
|
|
176 |
for path in audio_paths:
|
177 |
try:
|
178 |
audio = AudioSegment.from_mp3(path)
|
179 |
-
# No need to adjust speed here, it's done in generate_audio_with_voice_prefix
|
180 |
combined_line_audio += audio
|
181 |
os.remove(path)
|
182 |
except FileNotFoundError:
|
@@ -204,8 +203,8 @@ async def transcript_to_speech(transcript_text, voice, rate, pitch):
|
|
204 |
return combined_audio_path, None
|
205 |
|
206 |
@spaces.GPU
def tts_interface(transcript, voice, rate, pitch):
    """Blocking wrapper around the async ``transcript_to_speech`` coroutine.

    All four arguments are forwarded unchanged. The coroutine is driven to
    completion with ``asyncio.run`` and its two results are handed back as
    an ``(audio, warning)`` tuple.
    """
    pipeline = transcript_to_speech(transcript, voice, rate, pitch)
    result_audio, result_warning = asyncio.run(pipeline)
    return result_audio, result_warning
|
210 |
|
211 |
async def create_demo():
|
@@ -214,6 +213,7 @@ async def create_demo():
|
|
214 |
description = """
|
215 |
Process timestamped text (HH:MM:SS,milliseconds - HH:MM:SS,milliseconds) with voice changes within quotes.
|
216 |
The duration specified in the timestamp will be used to adjust the speech rate so the generated audio fits within that time.
|
|
|
217 |
Format: `HH:MM:SS,milliseconds - HH:MM:SS,milliseconds "VoicePrefix Text" more text "AnotherVoicePrefix More Text"`
|
218 |
Example:
|
219 |
```
|
@@ -243,7 +243,8 @@ async def create_demo():
|
|
243 |
gr.Textbox(label="Timestamped Text with Voice Changes and Duration", lines=10, placeholder='00:00:00,000 - 00:00:05,000 "Text" more text "1F Different Voice"'),
|
244 |
gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Default Voice", value=default_voice),
|
245 |
gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
|
246 |
-
gr.Slider(minimum=-50, maximum=50, value=0, label="Pitch Adjustment (Hz)", step=1)
|
|
|
247 |
],
|
248 |
outputs=[
|
249 |
gr.Audio(label="Generated Audio", type="filepath"),
|
@@ -257,6 +258,5 @@ async def create_demo():
|
|
257 |
return demo
|
258 |
|
259 |
if __name__ == "__main__":
    # soundfile is only brought into scope when the module runs as a script.
    import soundfile as sf  # Import soundfile here

    # create_demo() is a coroutine, so it is run to completion on its own
    # event loop first; the object it returns is then launched.
    demo = asyncio.run(create_demo())
    demo.launch()
|
|
|
8 |
from pathlib import Path
|
9 |
from pydub import AudioSegment
|
10 |
import librosa
|
11 |
+
import soundfile as sf
|
12 |
import numpy as np
|
13 |
|
14 |
def get_silence(duration_ms=1000):
|
|
|
42 |
voices = await edge_tts.list_voices()
|
43 |
return {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices}
|
44 |
|
45 |
+
async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pitch, target_duration_ms=None, speed_adjustment_factor=1.0):
|
46 |
"""Generates audio for a text segment, handling voice prefixes and adjusting rate for duration."""
|
47 |
current_voice_full = default_voice
|
48 |
current_voice_short = current_voice_full.split(" - ")[0] if current_voice_full else ""
|
|
|
105 |
print(f"Generated audio duration: {audio_duration_ms}ms, Target duration: {target_duration_ms}ms") # Debug
|
106 |
|
107 |
if audio_duration_ms > target_duration_ms and target_duration_ms > 0:
|
108 |
+
speed_factor = (audio_duration_ms / target_duration_ms) * speed_adjustment_factor
|
109 |
+
print(f"Speed factor (after user adjustment): {speed_factor}") # Debug
|
110 |
if speed_factor > 0:
|
|
|
111 |
if speed_factor <1.0:
|
112 |
speed_factor = 1.0
|
113 |
y, sr = librosa.load(audio_path, sr=None)
|
|
|
119 |
return audio_path
|
120 |
return None
|
121 |
|
122 |
+
async def process_transcript_line(line, default_voice, rate, pitch, speed_adjustment_factor):
|
123 |
"""Processes a single transcript line with HH:MM:SS,milliseconds - HH:MM:SS,milliseconds timestamp."""
|
124 |
match = re.match(r'(\d{2}):(\d{2}):(\d{2}),(\d{3})\s+-\s+(\d{2}):(\d{2}):(\d{2}),(\d{3})\s+(.*)', line)
|
125 |
if match:
|
|
|
145 |
process_next = not process_next
|
146 |
continue
|
147 |
if process_next and part.strip():
|
148 |
+
audio_path = await generate_audio_with_voice_prefix(part, default_voice, rate, pitch, duration_ms, speed_adjustment_factor)
|
149 |
if audio_path:
|
150 |
audio_segments.append(audio_path)
|
151 |
elif not process_next and part.strip():
|
152 |
+
audio_path = await generate_audio_with_voice_prefix(part, default_voice, rate, pitch, duration_ms, speed_adjustment_factor)
|
153 |
if audio_path:
|
154 |
audio_segments.append(audio_path)
|
155 |
|
156 |
return start_time_ms, audio_segments, duration_ms
|
157 |
return None, None, None
|
158 |
|
159 |
+
async def transcript_to_speech(transcript_text, voice, rate, pitch, speed_adjustment_factor):
|
160 |
if not transcript_text.strip():
|
161 |
return None, gr.Warning("Please enter transcript text.")
|
162 |
if not voice:
|
|
|
167 |
max_end_time_ms = 0
|
168 |
|
169 |
for line in lines:
|
170 |
+
start_time, audio_paths, duration = await process_transcript_line(line, voice, rate, pitch, speed_adjustment_factor)
|
171 |
if start_time is not None and audio_paths:
|
172 |
combined_line_audio = AudioSegment.empty()
|
173 |
current_time_ms = start_time
|
|
|
176 |
for path in audio_paths:
|
177 |
try:
|
178 |
audio = AudioSegment.from_mp3(path)
|
|
|
179 |
combined_line_audio += audio
|
180 |
os.remove(path)
|
181 |
except FileNotFoundError:
|
|
|
203 |
return combined_audio_path, None
|
204 |
|
205 |
@spaces.GPU
def tts_interface(transcript, voice, rate, pitch, speed_adjustment_factor):
    """Blocking wrapper around the async ``transcript_to_speech`` coroutine.

    All five arguments, including ``speed_adjustment_factor``, are forwarded
    unchanged. The coroutine is driven to completion with ``asyncio.run`` and
    its two results are handed back as an ``(audio, warning)`` tuple.
    """
    pipeline = transcript_to_speech(
        transcript, voice, rate, pitch, speed_adjustment_factor
    )
    result_audio, result_warning = asyncio.run(pipeline)
    return result_audio, result_warning
|
209 |
|
210 |
async def create_demo():
|
|
|
213 |
description = """
|
214 |
Process timestamped text (HH:MM:SS,milliseconds - HH:MM:SS,milliseconds) with voice changes within quotes.
|
215 |
The duration specified in the timestamp will be used to adjust the speech rate so the generated audio fits within that time.
|
216 |
+
You can control the intensity of the speed adjustment using the "Speed Adjustment Factor" slider.
|
217 |
Format: `HH:MM:SS,milliseconds - HH:MM:SS,milliseconds "VoicePrefix Text" more text "AnotherVoicePrefix More Text"`
|
218 |
Example:
|
219 |
```
|
|
|
243 |
gr.Textbox(label="Timestamped Text with Voice Changes and Duration", lines=10, placeholder='00:00:00,000 - 00:00:05,000 "Text" more text "1F Different Voice"'),
|
244 |
gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Default Voice", value=default_voice),
|
245 |
gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
|
246 |
+
gr.Slider(minimum=-50, maximum=50, value=0, label="Pitch Adjustment (Hz)", step=1),
|
247 |
+
gr.Slider(minimum=0.5, maximum=1.5, value=1.0, step=0.05, label="Speed Adjustment Factor")
|
248 |
],
|
249 |
outputs=[
|
250 |
gr.Audio(label="Generated Audio", type="filepath"),
|
|
|
258 |
return demo
|
259 |
|
260 |
if __name__ == "__main__":
    # create_demo() is a coroutine, so it is run to completion on its own
    # event loop first; the object it returns is then launched.
    demo = asyncio.run(create_demo())
    demo.launch()
|