Update app.py
app.py CHANGED
@@ -1,129 +1,145 @@
-import spaces
-import gradio as gr
-import edge_tts
 import asyncio
-import tempfile
 import os
-import re


-
-
-
-

-
-
-
-
voice2 = "en-US-BrianNeural - en-US (Male)"
|
20 |
-
voice2F = "en-US-JennyNeural - en-US (Female)"
|
21 |
-
voice3 = "en-AU-WilliamNeural - en-AU (Male)"
|
22 |
voice3F = "en-HK-YanNeural - en-HK (Female)"
|
23 |
-
voice4 = "en-GB-MaisieNeural - en-GB (Female)" #Child
|
|
|
24 |
if not text.strip():
|
25 |
return None
|
|
|
|
|
|
|
26 |
if text.startswith("1F"):
|
27 |
-
text2 = text[2:]
|
28 |
-
voice_short_name =voice1F.split(" - ")[0]
|
29 |
elif text.startswith("2F"):
|
30 |
-
text2 = text[2:]
|
31 |
-
voice_short_name =voice2F.split(" - ")[0]
|
32 |
elif text.startswith("3F"):
|
33 |
-
text2 = text[2:]
|
34 |
-
voice_short_name =voice3F.split(" - ")[0]
|
35 |
elif text.startswith("1M"):
|
36 |
-
text2 = text[2:]
|
37 |
-
voice_short_name =voice2.split(" - ")[0]
|
38 |
elif text.startswith("2M"):
|
39 |
-
text2 = text[2:]
|
40 |
-
voice_short_name =voice3.split(" - ")[0]
|
41 |
elif text.startswith("1C"):
|
42 |
-
text2 = text[2:]
|
43 |
-
voice_short_name =voice4.split(" - ")[0]
|
44 |
else:
|
45 |
-
|
46 |
-
|
47 |
-
text2=text
|
48 |
rate_str = f"{rate:+d}%"
|
49 |
pitch_str = f"{pitch:+d}Hz"
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
|
|
|
|
|
|
|
|
55 |
|
-# Main text-to-speech function
 async def text_to_speech(text, voice, rate, pitch):
     if not text.strip():
         return None, gr.Warning("Please enter text to convert.")
     if not voice:
         return None, gr.Warning("Please select a voice.")

-    # Split by two or more newline characters, optionally preceded by carriage returns
-    paragraphs = [p for p in re.split(r'\r?\n\r?\n+', text) if p.strip()]
-
     audio_files = []
-
-
         if audio_path:
             audio_files.append(audio_path)

     if not audio_files:
-        return None, None

-    # Combine audio files if there are multiple paragraphs
     if len(audio_files) == 1:
         return audio_files[0], None
     else:
-        #
         combined_audio_path = tempfile.mktemp(suffix=".mp3")
         with open(combined_audio_path, 'wb') as outfile:
             for filename in audio_files:
-
-
-
         return combined_audio_path, None
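The regex-based paragraph split removed above can be checked in isolation. This snippet is illustrative only and is not part of the repository; the sample string is made up:

import re

sample = "First paragraph.\r\n\r\nSecond paragraph.\n\n\nThird paragraph."
paragraphs = [p for p in re.split(r'\r?\n\r?\n+', sample) if p.strip()]
print(paragraphs)
# ['First paragraph.', 'Second paragraph.', 'Third paragraph.']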
-
-# Gradio interface function
-@spaces.GPU
-def tts_interface(text, voice, rate, pitch):
-    audio, warning = asyncio.run(text_to_speech(text, voice, rate, pitch))
-    return audio, warning
-
-# Create Gradio application
-import gradio as gr
-
-async def create_demo():
-    voices = await get_voices()
-    default_voice = "en-US-AndrewNeural - en-US (Male)" # 👈 Pick one of the available voices
-    description = """
-    Default = male, other voices 1F:US_Emma, 2F:US_Jenny, 3F:HK_Jan, 1M:US_Guy, 2M:AU_William, 1C: Childvoice
-    Enter your text, select a voice, and adjust the speech rate and pitch.
-    The application will process your text paragraph by paragraph (separated by two blank lines).
-    """
-
-    demo = gr.Interface(
-        fn=tts_interface,
-        inputs=[
-            gr.Textbox(label="Input Text", lines=5, placeholder="Separate paragraphs with two blank lines."),
-            gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Voice", value=default_voice),
-            gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
-            gr.Slider(minimum=-20, maximum=20, value=0, label="Pitch Adjustment (Hz)", step=1)
-        ],
-        outputs=[
-            gr.Audio(label="Generated Audio", type="filepath"),
-            gr.Markdown(label="Warning", visible=False)
-        ],
-        title="Voicecloning.be Text-to-Speech (Paragraph by Paragraph)",
-        description=description,
-        article="Process text paragraph by paragraph for smoother output.",
-        analytics_enabled=False,
-        allow_flagging=False
-    )
-    return demo
-
-# Run the application
-if __name__ == "__main__":
-    demo = asyncio.run(create_demo())
-    demo.launch()
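The removed create_demo() above calls get_voices(), which is not defined anywhere in this hunk. One plausible implementation on top of edge_tts.list_voices(), shown here purely as an assumption about what that helper might look like, would map display labels of the form used above ("en-US-AndrewNeural - en-US (Male)") to the bare voice ShortName:

import asyncio
import edge_tts

async def get_voices():
    # Hypothetical helper (not shown in this diff): build display labels
    # and map them to the ShortName that edge-tts expects.
    voices = await edge_tts.list_voices()
    return {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices}

if __name__ == "__main__":
    print(list(asyncio.run(get_voices()).keys())[:3])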
 import asyncio
 import os
+import re
+import tempfile
+import edge_tts
+import gradio as gr
+
+# Default voice
+default_voice = "en-US-AndrewNeural - en-US (Male)"

+# Text-to-speech function for a single segment
+async def process_speech_segment(text, voice, rate, pitch):
+    """
+    Processes a single segment of text (either a quote or regular text)
+    and generates speech using edge-tts.
+
+    Args:
+        text (str): The text to be converted to speech.
+        voice (str): The voice to use (can be overridden by prefixes).
+        rate (int): The speech rate.
+        pitch (int): The speech pitch.
+
+    Returns:
+        str: The path to the generated audio file, or None on error.
+    """
+    voice1 = "en-US-AndrewNeural - en-US (Male)"  # good for reading
+    voice1F = "en-US-EmmaNeural - en-US (Female)"
     voice2 = "en-US-BrianNeural - en-US (Male)"
+    voice2F = "en-US-JennyNeural - en-US (Female)"
+    voice3 = "en-AU-WilliamNeural - en-AU (Male)"
     voice3F = "en-HK-YanNeural - en-HK (Female)"
+    voice4 = "en-GB-MaisieNeural - en-GB (Female)"  # Child
+
     if not text.strip():
         return None
+
+    voice_short_name = voice.split(" - ")[0]  # default
+
     if text.startswith("1F"):
+        text2 = text[2:].strip()
+        voice_short_name = voice1F.split(" - ")[0]
     elif text.startswith("2F"):
+        text2 = text[2:].strip()
+        voice_short_name = voice2F.split(" - ")[0]
     elif text.startswith("3F"):
+        text2 = text[2:].strip()
+        voice_short_name = voice3F.split(" - ")[0]
     elif text.startswith("1M"):
+        text2 = text[2:].strip()
+        voice_short_name = voice2.split(" - ")[0]
     elif text.startswith("2M"):
+        text2 = text[2:].strip()
+        voice_short_name = voice3.split(" - ")[0]
     elif text.startswith("1C"):
+        text2 = text[2:].strip()
+        voice_short_name = voice4.split(" - ")[0]
     else:
+        text2 = text
+
     rate_str = f"{rate:+d}%"
     pitch_str = f"{pitch:+d}Hz"
+    try:
+        communicate = edge_tts.Communicate(text2, voice=voice_short_name, rate=rate_str, pitch=pitch_str)  # removed async
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
+            tmp_path = tmp_file.name
+            await communicate.save(tmp_path)  # added await
+        return tmp_path
+    except Exception as e:
+        print(f"Error processing segment: {e}")  # Log the error
+        return None

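A quick way to exercise process_speech_segment on its own is a small driver script. This is a minimal sketch, not part of the commit; it assumes the function and default_voice above are importable from app.py and that the Edge TTS service is reachable, and the input string is invented:

import asyncio

from app import process_speech_segment, default_voice

async def _demo():
    # "1F" at the start of the segment switches to the en-US-EmmaNeural voice.
    path = await process_speech_segment('1F Hello from the Emma voice.', default_voice, 0, 0)
    print(path)  # e.g. /tmp/tmpXXXXXX.mp3, or None if generation failed

if __name__ == "__main__":
    asyncio.run(_demo())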
+# Main text-to-speech function
 async def text_to_speech(text, voice, rate, pitch):
+    """
+    Processes the input text, identifying quoted sections for different voices,
+    and generates combined audio.
+
+    Args:
+        text (str): The input text.
+        voice (str): The default voice.
+        rate (int): The speech rate.
+        pitch (int): The speech pitch.
+
+    Returns:
+        tuple: (audio_path, error_message) where audio_path is the path to the
+        combined audio file, and error_message is any error encountered.
+    """
     if not text.strip():
         return None, gr.Warning("Please enter text to convert.")
     if not voice:
         return None, gr.Warning("Please select a voice.")

     audio_files = []
+    segments = []
+    i = 0
+    while i < len(text):
+        if text[i] == '"':
+            # Find the closing quote
+            j = i + 1
+            while j < len(text) and text[j] != '"':
+                j += 1
+            if j < len(text):
+                segments.append(("quote", text[i + 1:j]))
+                i = j + 1
+            else:
+                segments.append(("text", text[i:]))  # Handle unclosed quote
+                i = j
+        else:
+            # Find the end of the non-quote text
+            j = i + 1
+            while j < len(text) and text[j] != '"':
+                j += 1
+            segments.append(("text", text[i:j]))
+            i = j
+
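On a sample sentence (made up for illustration), the scanner above alternates between narration and quoted speech; any speaker prefix inside the quote is only stripped later, in process_speech_segment:

sample = 'She said "1F Hi there" and waved.'
# The while loop above would yield:
#   [("text", 'She said '), ("quote", '1F Hi there'), ("text", ' and waved.')]
# The "1F" prefix makes the quoted segment use the en-US-EmmaNeural voice,
# while the surrounding narration keeps the default voice passed in by the caller.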
+    for segment_type, segment_text in segments:
+        if segment_type == "quote":
+            # Determine the voice based on the prefix within the quote.
+            voice_prefix = ""
+            if segment_text.startswith("1F") or segment_text.startswith("2F") or segment_text.startswith("3F") or segment_text.startswith("1M") or segment_text.startswith("2M") or segment_text.startswith("1C"):
+                voice_prefix = segment_text[:2]
+            audio_path = await process_speech_segment(segment_text, voice, rate, pitch)
+        else:
+            audio_path = await process_speech_segment(segment_text, voice, rate, pitch)
         if audio_path:
             audio_files.append(audio_path)

     if not audio_files:
+        return None, None

     if len(audio_files) == 1:
         return audio_files[0], None
     else:
+        # Combine audio files
         combined_audio_path = tempfile.mktemp(suffix=".mp3")
         with open(combined_audio_path, 'wb') as outfile:
             for filename in audio_files:
+                try:
+                    with open(filename, 'rb') as infile:
+                        outfile.write(infile.read())
+                    os.remove(filename)  # Clean up individual files
+                except Exception as e:
+                    print(f"Error combining audio files: {e}")
+                    return None, gr.Error(f"Error combining audio files: {e}")
         return combined_audio_path, None
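The combination step above concatenates raw MP3 bytes, which most players tolerate but which does not produce strictly valid MP3 framing. If that ever becomes a problem, re-encoding is one alternative; this sketch is not what the commit does, combine_mp3 is a hypothetical helper name, and it assumes pydub and ffmpeg are installed:

from pydub import AudioSegment

def combine_mp3(paths, out_path):
    # Decode each clip and re-export a single well-formed MP3.
    combined = AudioSegment.empty()
    for p in paths:
        combined += AudioSegment.from_mp3(p)
    combined.export(out_path, format="mp3")
    return out_path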
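This hunk ends without any UI wiring, and the gr.Interface block that used to provide it is removed above. A minimal way to drive the new text_to_speech coroutine from Gradio, sketched from the removed code rather than taken from the Space's actual file, would be to append something like this to the bottom of app.py (the tts_interface name is reused from the removed code):

def tts_interface(text, voice, rate, pitch):
    # Thin synchronous wrapper so Gradio can call the async pipeline above.
    return asyncio.run(text_to_speech(text, voice, rate, pitch))

demo = gr.Interface(
    fn=tts_interface,
    inputs=[
        gr.Textbox(label="Input Text", lines=5),
        gr.Dropdown(choices=[default_voice], value=default_voice, label="Select Voice"),
        gr.Slider(minimum=-50, maximum=50, value=0, step=1, label="Speech Rate Adjustment (%)"),
        gr.Slider(minimum=-20, maximum=20, value=0, step=1, label="Pitch Adjustment (Hz)"),
    ],
    outputs=[
        gr.Audio(label="Generated Audio", type="filepath"),
        gr.Markdown(label="Warning", visible=False),
    ],
    title="Text-to-Speech",
)

if __name__ == "__main__":
    demo.launch()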