cnph001 committed on
Commit
552e1db
·
verified ·
1 Parent(s): f5bf582

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +53 -49
app.py CHANGED
@@ -4,22 +4,24 @@ import edge_tts
4
  import asyncio
5
  import tempfile
6
  import os
7
- import re
8
  import struct
9
  import wave
10
 
11
- # Define the get_voices function first
12
- async def get_voices():
13
- voices_list = await edge_tts.list_voices()
14
- voices_dict = {v["ShortName"]: f"{v['Name']} - {v['LocaleName']} ({v['Gender']})" for v in voices_list}
15
- return voices_dict
16
-
17
  # Function to create a temporary silent WAV file
18
  def create_silent_wav(duration, temp_dir, sample_rate=44100, num_channels=1, sample_width=2):
19
- """Creates a temporary WAV file containing silence."""
20
- if duration <= 0:
21
- raise ValueError("Duration must be positive.")
22
-
 
 
 
 
 
 
 
 
23
  num_frames = int(duration * sample_rate)
24
  silent_data = b'\x00' * (num_frames * num_channels * sample_width)
25
 
@@ -31,18 +33,16 @@ def create_silent_wav(duration, temp_dir, sample_rate=44100, num_channels=1, sam
31
  wf.writeframes(silent_data)
32
  return temp_wav_path
33
 
34
- # Function to process text and generate audio for a single paragraph
35
  async def paragraph_to_speech(text, voice, rate, pitch):
36
- voices = {
37
- "voice1F": "en-US-EmmaNeural - en-US (Female)",
38
- "voice2F": "en-US-JennyNeural - en-US (Female)",
39
- "voice3F": "en-HK-YanNeural - en-HK (Female)",
40
- "voice1": "en-AU-WilliamNeural - en-AU (Male)",
41
- "voice2": "it-IT-GiuseppeMultilingualNeural - it-IT (Male)",
42
- "voice3": "en-US-BrianMultilingualNeural - en-US (Male)",
43
- "voice4": "en-GB-MaisieNeural - en-GB (Female)", # Child
44
- "voice5": "en-GB-RyanNeural - en-GB (Male)" # Old Man
45
- }
46
 
47
  if not text.strip():
48
  return None, [] # Return None for audio path and empty list for silence
@@ -55,6 +55,7 @@ async def paragraph_to_speech(text, voice, rate, pitch):
55
  if re.match(r'SS\d+\.?\d*', part):
56
  try:
57
  silence_duration = float(part[2:])
 
58
  silent_wav_path = create_silent_wav(silence_duration, temp_dir)
59
  audio_segments.append(silent_wav_path)
60
  except ValueError:
@@ -65,50 +66,46 @@ async def paragraph_to_speech(text, voice, rate, pitch):
65
  current_rate = rate
66
  current_pitch = pitch
67
 
68
- # Select voice based on part prefix
69
  if part.startswith("1F"):
70
  processed_text = part[2:]
71
- current_voice = voices["voice1F"]
72
  elif part.startswith("2F"):
73
  processed_text = part[2:]
74
- current_voice = voices["voice2F"]
75
  elif part.startswith("3F"):
76
  processed_text = part[2:]
77
- current_voice = voices["voice3F"]
78
  elif part.startswith("1M"):
79
  processed_text = part[2:]
80
- current_voice = voices["voice1"]
81
  elif part.startswith("2M"):
82
  processed_text = part[2:]
83
- current_voice = voices["voice2"]
84
  elif part.startswith("3M"):
85
  processed_text = part[2:]
86
- current_voice = voices["voice3"]
87
  elif part.startswith("1C"):
88
  processed_text = part[2:]
89
- current_voice = voices["voice4"]
90
  elif part.startswith("1O"):
91
  processed_text = part[2:]
92
- current_voice = voices["voice5"]
93
  current_pitch = -30
94
  current_rate = -20
95
  else:
96
- current_voice = (voice or voices["voice1"]).split(" - ")[0]
97
- processed_text = part[:]
98
-
99
  rate_str = f"{current_rate:+d}%"
100
  pitch_str = f"{current_pitch:+d}Hz"
101
  communicate = edge_tts.Communicate(processed_text, current_voice, rate=rate_str, pitch=pitch_str)
102
-
103
- # Save speech output to temporary file
104
  with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
105
  tmp_path = tmp_file.name
106
  await communicate.save(tmp_path)
107
  audio_segments.append(tmp_path)
108
  else:
109
- audio_segments.append(None) # Empty string
110
 
111
- return audio_segments, [] # Returning empty list for silence times as we are directly creating silent WAV
112
 
113
  # Main text-to-speech function that processes paragraphs and silence
114
  async def text_to_speech(text, voice, rate, pitch):
@@ -117,7 +114,7 @@ async def text_to_speech(text, voice, rate, pitch):
117
  if not voice:
118
  return None, gr.Warning("Please select a voice.")
119
 
120
- paragraphs = [p.strip() for p in re.split(r'\n\n+', text) if p.strip()]
121
  final_audio_segments = []
122
 
123
  for paragraph in paragraphs:
@@ -170,14 +167,21 @@ async def text_to_speech(text, voice, rate, pitch):
170
 
171
  return combined_audio_path, None
172
 
173
- # Gradio interface function (wrapper to run async code)
174
- def tts_interface_sync(text, voice, rate, pitch):
175
- return asyncio.run(tts_interface(text, voice, rate, pitch))
 
 
 
 
 
 
 
176
 
177
- # Gradio interface
178
  async def create_demo():
179
- voices = await get_voices() # Now this function is defined
180
- default_voice = "en-US-AndrewMultilingualNeural - en-US (Male)"
181
  description = """
182
  Default = male, other voices 1F:US_Emma, 2F:US_Jenny, 3F:HK_Yan, 1M:AU_Will, 2M:IT_Guiseppe,3M:US_Brian, 1C: Childvoice, 1O = OldMan
183
  You can insert silence using the marker 'SS' followed by the duration in seconds (e.g., 'SS1.2' for a 1.2-second pause).
@@ -186,14 +190,14 @@ async def create_demo():
186
  """
187
 
188
  demo = gr.Interface(
189
- fn=tts_interface_sync,
190
- inputs=[
191
  gr.Textbox(label="Input Text", lines=5, placeholder="Separate paragraphs with two blank lines. Use 'SS[duration]' for silence."),
192
  gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Voice", value=default_voice),
193
  gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
194
  gr.Slider(minimum=-50, maximum=50, value=0, label="Pitch Adjustment (Hz)", step=1)
195
  ],
196
- outputs=[
197
  gr.Audio(label="Generated Audio", type="filepath"),
198
  gr.Markdown(label="Warning", visible=False)
199
  ],
@@ -208,4 +212,4 @@ async def create_demo():
208
  # Run the application
209
  if __name__ == "__main__":
210
  demo = asyncio.run(create_demo())
211
- demo.launch()
 
4
  import asyncio
5
  import tempfile
6
  import os
7
+ import re # Import the regular expression module
8
  import struct
9
  import wave
10
 
 
 
 
 
 
 
11
  # Function to create a temporary silent WAV file
12
  def create_silent_wav(duration, temp_dir, sample_rate=44100, num_channels=1, sample_width=2):
13
+ """Creates a temporary WAV file containing silence.
14
+
15
+ Args:
16
+ duration (float): Duration of silence in seconds.
17
+ temp_dir (str): Directory to save the temporary file.
18
+ sample_rate (int): Sample rate of the audio (samples per second).
19
+ num_channels (int): Number of audio channels (1 for mono, 2 for stereo).
20
+ sample_width (int): Sample width in bytes (e.g., 2 for 16-bit).
21
+
22
+ Returns:
23
+ str: Path to the temporary silent WAV file.
24
+ """
25
  num_frames = int(duration * sample_rate)
26
  silent_data = b'\x00' * (num_frames * num_channels * sample_width)
27
 
 
33
  wf.writeframes(silent_data)
34
  return temp_wav_path
35
 
36
+ # Text-to-speech function for a single paragraph with SS handling
37
  async def paragraph_to_speech(text, voice, rate, pitch):
38
+ voice3 ="en-US-BrianMultilingualNeural - en-US (Male)" #good for reading
39
+ voice1F ="en-US-EmmaNeural - en-US (Female)"
40
+ voice2 = "it-IT-GiuseppeMultilingualNeural - it-IT (Male)"
41
+ voice2F = "en-US-JennyNeural - en-US (Female)"
42
+ voice1 = "en-AU-WilliamNeural - en-AU (Male)"
43
+ voice3F = "en-HK-YanNeural - en-HK (Female)"
44
+ voice4 = "en-GB-MaisieNeural - en-GB (Female)" #Child
45
+ voice5 = "en-GB-RyanNeural - en-GB (Male)" #Old Man
 
 
46
 
47
  if not text.strip():
48
  return None, [] # Return None for audio path and empty list for silence
 
55
  if re.match(r'SS\d+\.?\d*', part):
56
  try:
57
  silence_duration = float(part[2:])
58
+ # Assuming default WAV parameters for silence
59
  silent_wav_path = create_silent_wav(silence_duration, temp_dir)
60
  audio_segments.append(silent_wav_path)
61
  except ValueError:
 
66
  current_rate = rate
67
  current_pitch = pitch
68
 
 
69
  if part.startswith("1F"):
70
  processed_text = part[2:]
71
+ current_voice = voice1F.split(" - ")[0]
72
  elif part.startswith("2F"):
73
  processed_text = part[2:]
74
+ current_voice = voice2F.split(" - ")[0]
75
  elif part.startswith("3F"):
76
  processed_text = part[2:]
77
+ current_voice = voice3F.split(" - ")[0]
78
  elif part.startswith("1M"):
79
  processed_text = part[2:]
80
+ current_voice = voice1.split(" - ")[0]
81
  elif part.startswith("2M"):
82
  processed_text = part[2:]
83
+ current_voice = voice2.split(" - ")[0]
84
  elif part.startswith("3M"):
85
  processed_text = part[2:]
86
+ current_voice = voice3.split(" - ")[0]
87
  elif part.startswith("1C"):
88
  processed_text = part[2:]
89
+ current_voice = voice4.split(" - ")[0]
90
  elif part.startswith("1O"):
91
  processed_text = part[2:]
92
+ current_voice = voice5.split(" - ")[0]
93
  current_pitch = -30
94
  current_rate = -20
95
  else:
96
+ current_voice = (voice or default_voice).split(" - ")[0]
97
+ processed_text=part[:]
 
98
  rate_str = f"{current_rate:+d}%"
99
  pitch_str = f"{current_pitch:+d}Hz"
100
  communicate = edge_tts.Communicate(processed_text, current_voice, rate=rate_str, pitch=pitch_str)
 
 
101
  with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
102
  tmp_path = tmp_file.name
103
  await communicate.save(tmp_path)
104
  audio_segments.append(tmp_path)
105
  else:
106
+ audio_segments.append(None) # Empty string
107
 
108
+ return audio_segments, [] # Returning empty list for silence times as we are directly creating silent WAV
109
 
110
  # Main text-to-speech function that processes paragraphs and silence
111
  async def text_to_speech(text, voice, rate, pitch):
 
114
  if not voice:
115
  return None, gr.Warning("Please select a voice.")
116
 
117
+ paragraphs = [p.strip() for p in re.split(r'"', text) if p.strip()]
118
  final_audio_segments = []
119
 
120
  for paragraph in paragraphs:
 
167
 
168
  return combined_audio_path, None
169
 
170
+ # Gradio interface function
171
@spaces.GPU
def tts_interface(text, voice, rate, pitch):
    """Synchronous entry point that drives the async ``text_to_speech`` coroutine.

    Args:
        text: Input text forwarded unchanged to ``text_to_speech``.
        voice: Selected voice forwarded unchanged to ``text_to_speech``.
        rate: Speech-rate adjustment forwarded unchanged.
        pitch: Pitch adjustment forwarded unchanged.

    Returns:
        tuple: The ``(audio, warning)`` pair produced by ``text_to_speech``.
    """
    # asyncio.run starts a fresh event loop for this call and closes it afterwards.
    outcome = asyncio.run(text_to_speech(text, voice, rate, pitch))
    audio_path, notice = outcome
    return audio_path, notice
175
+
176
async def get_voices():
    """Fetch the edge-tts voice list and index it by short name.

    Returns:
        dict: Maps each voice's ``ShortName`` to a display label of the form
        ``"<Name> - <locale> (<Gender>)"``.
    """
    voices_list = await edge_tts.list_voices()
    voices_dict = {}
    for v in voices_list:
        # NOTE(review): edge-tts voice entries appear to carry 'Locale' rather
        # than 'LocaleName' -- confirm against the installed version. Prefer
        # 'LocaleName' when present so existing behaviour is unchanged, and
        # fall back to 'Locale' instead of raising KeyError.
        locale = v["LocaleName"] if "LocaleName" in v else v["Locale"]
        voices_dict[v["ShortName"]] = f"{v['Name']} - {locale} ({v['Gender']})"
    return voices_dict
180
 
181
+ # Create Gradio application
182
  async def create_demo():
183
+ voices = await get_voices()
184
+ default_voice = "en-US-AndrewMultilingualNeural - en-US (Male)" # 👈 Pick one of the available voices
185
  description = """
186
  Default = male, other voices 1F:US_Emma, 2F:US_Jenny, 3F:HK_Yan, 1M:AU_Will, 2M:IT_Guiseppe,3M:US_Brian, 1C: Childvoice, 1O = OldMan
187
  You can insert silence using the marker 'SS' followed by the duration in seconds (e.g., 'SS1.2' for a 1.2-second pause).
 
190
  """
191
 
192
  demo = gr.Interface(
193
+ fn=tts_interface,
194
+ inputs=[
195
  gr.Textbox(label="Input Text", lines=5, placeholder="Separate paragraphs with two blank lines. Use 'SS[duration]' for silence."),
196
  gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Voice", value=default_voice),
197
  gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
198
  gr.Slider(minimum=-50, maximum=50, value=0, label="Pitch Adjustment (Hz)", step=1)
199
  ],
200
+ outputs=[
201
  gr.Audio(label="Generated Audio", type="filepath"),
202
  gr.Markdown(label="Warning", visible=False)
203
  ],
 
212
  # Run the application
213
  if __name__ == "__main__":
214
  demo = asyncio.run(create_demo())
215
+ demo.launch()