cnph001 committed on
Commit
ff3ad52
·
verified ·
1 Parent(s): 298c01a

Revert to previous code

Browse files
Files changed (1) hide show
  1. app.py +133 -172
app.py CHANGED
@@ -1,133 +1,150 @@
1
- import spaces
2
- import gradio as gr
3
- import edge_tts
4
- import asyncio
5
- import tempfile
6
- import os
7
- import re
8
- from pathlib import Path
9
- from pydub import AudioSegment
10
- import librosa
11
  import soundfile as sf
12
  import numpy as np
13
 
14
- # Global constant for voice mapping
15
- VOICE_MAP = {
16
- "1F": "en-GB-SoniaNeural",
17
- "2M": "en-GB-RyanNeural",
18
- "3M": "en-US-BrianMultilingualNeural",
19
- "2F": "en-US-JennyNeural",
20
- "1M": "en-AU-WilliamNeural",
21
- "3F": "en-HK-YanNeural",
22
- "4M": "en-GB-ThomasNeural",
23
- "4F": "en-US-EmmaNeural",
24
- "1O": "en-GB-RyanNeural", # Old Man
25
- "1C": "en-GB-MaisieNeural", # Child
26
- "1V": "vi-VN-HoaiMyNeural", # Vietnamese (Female)
27
- "2V": "vi-VN-NamMinhNeural", # Vietnamese (Male)
28
- "3V": "vi-VN-HoaiMyNeural", # Vietnamese (Female)
29
- "4V": "vi-VN-NamMinhNeural", # Vietnamese (Male)
30
- }
31
-
32
  def get_silence(duration_ms=1000):
33
- """Creates a silent AudioSegment."""
34
- return AudioSegment.silent(
35
  duration=duration_ms,
36
- frame_rate=24000,
37
- sample_width=4,
38
- channels=1
39
  )
40
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  async def get_voices():
42
- """Lists available Edge TTS voices."""
43
  try:
44
  voices = await edge_tts.list_voices()
45
  return {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices}
46
- except Exception as e:
47
- print(f"Error listing voices: {e}")
48
- return {}
49
 
50
  async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pitch, target_duration_ms=None, speed_adjustment_factor=1.0):
51
  """Generates audio for a text segment, handling voice prefixes and adjusting rate for duration."""
52
- processed_text = text_segment.strip()
53
- current_voice_short = default_voice.split(" - ")[0] if default_voice else ""
54
  current_rate = rate
55
  current_pitch = pitch
56
-
57
- for prefix, voice_short in VOICE_MAP.items():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  if processed_text.startswith(prefix):
59
  current_voice_short = voice_short
60
  if prefix in ["1F", "3F", "1V", "3V"]:
61
- current_pitch = 25
62
  elif prefix in ["1O", "4V"]:
63
  current_pitch = -20
64
  current_rate = -10
 
65
  processed_text = processed_text[len(prefix):].strip()
66
  break
67
 
68
  match = re.search(r'([A-Za-z]+)-?(\d+)', processed_text)
69
- if match and match.group(1) in VOICE_MAP:
70
- pitch_adjustment = int(match.group(2))
71
- current_pitch += pitch_adjustment
72
- processed_text = re.sub(r'[A-Za-z]+-?\d+', '', processed_text, count=1).strip()
73
- elif any(processed_text.startswith(prefix) for prefix in VOICE_MAP): # Handle leftover prefixes
74
- processed_text = re.sub(r'^[A-Za-z]{1,2}', '', processed_text).lstrip('-').strip()
75
-
 
 
 
76
  if processed_text:
77
  rate_str = f"{current_rate:+d}%"
78
  pitch_str = f"{current_pitch:+d}Hz"
79
- try:
80
- communicate = edge_tts.Communicate(processed_text, current_voice_short, rate=rate_str, pitch=pitch_str)
81
  with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
82
  audio_path = tmp_file.name
83
  await communicate.save(audio_path)
84
-
85
- if target_duration_ms is not None and os.path.exists(audio_path) and target_duration_ms > 0:
86
- audio = AudioSegment.from_mp3(audio_path)
87
- audio_duration_ms = len(audio)
88
- if audio_duration_ms > target_duration_ms:
89
- speed_factor = (audio_duration_ms / target_duration_ms) * speed_adjustment_factor
90
- if speed_factor > 0 and speed_factor >= 1.0:
91
- y, sr = librosa.load(audio_path, sr=None)
92
- y_stretched = librosa.effects.time_stretch(y, rate=speed_factor)
93
- sf.write(audio_path, y_stretched, sr)
94
- return audio_path
 
 
 
 
 
95
  except Exception as e:
96
  print(f"Edge TTS error processing '{processed_text}': {e}")
97
  return None
98
  return None
99
 
100
  async def process_transcript_line(line, default_voice, rate, pitch, speed_adjustment_factor):
101
- """Processes a single transcript line with timestamp and potential voice changes."""
102
- match = re.match(r'(\d{2}:\d{2}:\d{2},\d{3})\s+-\s+(\d{2}:\d{2}:\d{2},\d{3})\s+(.*)', line)
103
  if match:
104
- start_time_str, end_time_str, text_parts = match.groups()
105
-
106
- def time_str_to_ms(time_str):
107
- h, m, s_ms = time_str.split(':')
108
- s, ms = s_ms.split(',')
109
- return int(h) * 3600000 + int(m) * 60000 + int(s) * 1000 + int(ms)
110
-
111
- start_time_ms = time_str_to_ms(start_time_str)
112
- end_time_ms = time_str_to_ms(end_time_str)
 
 
 
 
113
  duration_ms = end_time_ms - start_time_ms
114
 
115
  audio_segments = []
116
- parts = re.split(r'([“”"])', text_parts)
117
- in_quote = False
118
- for part in parts:
119
  if part == '"':
120
- in_quote = not in_quote
121
  continue
122
- if part.strip():
123
- audio_path = await generate_audio_with_voice_prefix(part, default_voice, rate, pitch, duration_ms, speed_adjustment_factor if in_quote else 1.0)
 
 
 
 
124
  if audio_path:
125
  audio_segments.append(audio_path)
126
  return start_time_ms, audio_segments, duration_ms
127
  return None, None, None
128
 
129
  async def transcript_to_speech(transcript_text, voice, rate, pitch, speed_adjustment_factor):
130
- """Converts a timestamped transcript with voice changes to a single audio file."""
131
  if not transcript_text.strip():
132
  return None, gr.Warning("Please enter transcript text.")
133
  if not voice:
@@ -136,103 +153,47 @@ async def transcript_to_speech(transcript_text, voice, rate, pitch, speed_adjust
136
  lines = transcript_text.strip().split('\n')
137
  timed_audio_segments = []
138
  max_end_time_ms = 0
139
-
140
- with tempfile.TemporaryDirectory() as tmpdir:
141
- for line in lines:
142
- start_time, audio_paths, duration = await process_transcript_line(line, voice, rate, pitch, speed_adjustment_factor)
143
- if start_time is not None and audio_paths:
144
- combined_line_audio = AudioSegment.empty()
145
- for path in audio_paths:
146
- if path and os.path.exists(path):
147
- try:
148
- audio = AudioSegment.from_mp3(path)
149
- combined_line_audio += audio
150
- except FileNotFoundError:
151
- print(f"Warning: Audio file not found: {path}")
152
- finally:
153
- try:
154
- os.remove(path)
155
- except OSError:
156
- print(f"Warning: Could not remove temporary file: {path}")
157
- if combined_line_audio:
158
- timed_audio_segments.append({'start': start_time, 'audio': combined_line_audio})
159
- max_end_time_ms = max(max_end_time_ms, start_time + len(combined_line_audio))
160
- elif audio_paths:
161
- for path in audio_paths:
162
- if path:
163
- try:
164
- os.remove(path)
165
- except FileNotFoundError:
166
- pass # Clean up even if no timestamp
167
-
168
- if not timed_audio_segments:
169
- return None, "No processable audio segments found."
170
-
171
- final_audio = AudioSegment.silent(duration=max_end_time_ms, frame_rate=24000)
172
- for segment in timed_audio_segments:
173
- final_audio = final_audio.overlay(segment['audio'], position=segment['start'])
174
-
175
- combined_audio_path = Path(tmpdir) / "combined_audio.mp3"
176
- final_audio.export(str(combined_audio_path), format="mp3")
177
- return str(combined_audio_path), None
178
 
179
  @spaces.GPU
180
  def tts_interface(transcript, voice, rate, pitch, speed_adjustment_factor):
181
- """Gradio interface function for TTS."""
182
  audio, warning = asyncio.run(transcript_to_speech(transcript, voice, rate, pitch, speed_adjustment_factor))
183
  return audio, warning
184
 
185
  async def create_demo():
186
- """Creates the Gradio demo interface."""
187
  voices = await get_voices()
188
  default_voice = "en-US-AndrewMultilingualNeural - en-US (Male)"
189
- description = """
190
- Process timestamped text (HH:MM:SS,milliseconds - HH:MM:SS,milliseconds) with voice changes within quotes.
191
- The duration specified in the timestamp will be used to adjust the speech rate so the generated audio fits within that time.
192
- You can control the intensity of the speed adjustment using the "Speed Adjustment Factor" slider.
193
- Format: `HH:MM:SS,milliseconds - HH:MM:SS,milliseconds "VoicePrefix Text" more text "AnotherVoicePrefix More Text"`
194
- Example:
195
- ```
196
- 00:00:00,000 - 00:00:05,000 "This is the default voice." more default. "1F Now a female voice." and back to default.
197
- 00:00:05,500 - 00:00:10,250 "1C Yes," said the child, "it is fun!"
198
- ```
199
- ***************************************************************************************************
200
- 1M = en-AU-WilliamNeural - en-AU (Male)
201
- 1F = en-GB-SoniaNeural - en-GB (Female)
202
- 2M = en-GB-RyanNeural - en-GB (Male)
203
- 2F = en-US-JennyNeural - en-US (Female)
204
- 3M = en-US-BrianMultilingualNeural - en-US (Male)
205
- 3F = en-HK-YanNeural - en-HK (Female)
206
- 4M = en-GB-ThomasNeural - en-GB (Male)
207
- 4F = en-US-EmmaNeural - en-US (Female)
208
- 1O = en-GB-RyanNeural - en-GB (Male) # Old Man
209
- 1C = en-GB-MaisieNeural - en-GB (Female) # Child
210
- 1V = vi-VN-HoaiMyNeural - vi-VN (Female) # Vietnamese (Female)
211
- 2V = vi-VN-NamMinhNeural - vi-VN (Male) # Vietnamese (Male)
212
- 3V = vi-VN-HoaiMyNeural - vi-VN (Female) # Vietnamese (Female)
213
- 4V = vi-VN-NamMinhNeural - vi-VN (Male) # Vietnamese (Male)
214
- ****************************************************************************************************
215
- """
216
- demo = gr.Interface(
217
- fn=tts_interface,
218
- inputs=[
219
- gr.Textbox(label="Timestamped Text with Voice Changes and Duration", lines=10, placeholder='00:00:00,000 - 00:00:05,000 "Text" more text "1F Different Voice"'),
220
- gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Default Voice", value=default_voice),
221
- gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
222
- gr.Slider(minimum=-50, maximum=50, value=0, label="Pitch Adjustment (Hz)", step=1),
223
- gr.Slider(minimum=0.5, maximum=1.5, value=1.0, step=0.05, label="Speed Adjustment Factor")
224
- ],
225
- outputs=[
226
- gr.Audio(label="Generated Audio", type="filepath"),
227
- gr.Markdown(label="Warning", visible=False)
228
- ],
229
- title="TTS with Duration-Aware Speed Adjustment and In-Quote Voice Switching",
230
- description=description,
231
- analytics_enabled=False,
232
- allow_flagging=False
233
- )
234
- return demo
235
-
236
- if __name__ == "__main__":
237
- demo = asyncio.run(create_demo())
238
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
1
  import soundfile as sf
2
  import numpy as np
3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  def get_silence(duration_ms=1000):
5
+ # Create silent audio segment with specified parameters
6
+ silent_audio = AudioSegment.silent(
7
  duration=duration_ms,
8
+ frame_rate=24000 # 24kHz sampling rate
 
 
9
  )
10
+ # Set audio parameters
11
+ silent_audio = silent_audio.set_channels(1) # Mono
12
+ silent_audio = silent_audio.set_sample_width(4) # 32-bit (4 bytes per sample)
13
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
14
+ # Export with specific bitrate and codec parameters
15
+ silent_audio.export(
16
+ tmp_file.name,
17
+ format="mp3",
18
+ bitrate="48k",
19
+ parameters=[
20
+ "-ac", "1", # Mono
21
+ "-ar", "24000", # Sample rate
22
+ "-sample_fmt", "s32", # 32-bit samples
23
+ "-codec:a", "libmp3lame" # MP3 codec
24
+ ]
25
+ )
26
+ return tmp_file.name
27
+
28
+ # Get all available voices
29
  async def get_voices():
30
+
31
  try:
32
  voices = await edge_tts.list_voices()
33
  return {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices}
 
 
 
34
 
35
  async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pitch, target_duration_ms=None, speed_adjustment_factor=1.0):
36
  """Generates audio for a text segment, handling voice prefixes and adjusting rate for duration."""
37
+ current_voice_full = default_voice
38
+ current_voice_short = current_voice_full.split(" - ")[0] if current_voice_full else ""
39
  current_rate = rate
40
  current_pitch = pitch
41
+ processed_text = text_segment.strip()
42
+ print(f"Processing this text segment: {processed_text}") # Debug
43
+ voice_map = {
44
+ "1F": "en-GB-SoniaNeural",
45
+ "2M": "en-GB-RyanNeural",
46
+ "3M": "en-US-BrianMultilingualNeural",
47
+ "2F": "en-US-JennyNeural",
48
+ "1M": "en-AU-WilliamNeural",
49
+ "3F": "en-HK-YanNeural",
50
+ "4M": "en-GB-ThomasNeural",
51
+ "4F": "en-US-EmmaNeural",
52
+ "1O": "en-GB-RyanNeural", # Old Man
53
+ "1C": "en-GB-MaisieNeural", # Child
54
+ "1V": "vi-VN-HoaiMyNeural", # Vietnamese (Female)
55
+ "2V": "vi-VN-NamMinhNeural", # Vietnamese (Male)
56
+ "3V": "vi-VN-HoaiMyNeural", # Vietnamese (Female)
57
+ "4V": "vi-VN-NamMinhNeural", # Vietnamese (Male)
58
+ }
59
+ detect = 0
60
+ for prefix, voice_short in voice_map.items():
61
  if processed_text.startswith(prefix):
62
  current_voice_short = voice_short
63
  if prefix in ["1F", "3F", "1V", "3V"]:
 
64
  elif prefix in ["1O", "4V"]:
65
  current_pitch = -20
66
  current_rate = -10
67
+ detect = 1
68
  processed_text = processed_text[len(prefix):].strip()
69
  break
70
 
71
  match = re.search(r'([A-Za-z]+)-?(\d+)', processed_text)
72
+ if match:
73
+ prefix_pitch = match.group(1)
74
+ number = int(match.group(2))
75
+ if prefix_pitch in voice_map:
76
+ current_pitch += number
77
+ processed_text = re.sub(r'[A-Za-z]+-?\d+', '', processed_text, count=1).strip()
78
+ elif detect:
79
+ processed_text = processed_text.lstrip('-0123456789').strip() # Remove potential leftover numbers
80
+ elif detect:
81
+ processed_text = processed_text[2:].strip()
82
  if processed_text:
83
  rate_str = f"{current_rate:+d}%"
84
  pitch_str = f"{current_pitch:+d}Hz"
 
 
85
  with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
86
  audio_path = tmp_file.name
87
  await communicate.save(audio_path)
88
+ if target_duration_ms is not None and os.path.exists(audio_path):
89
+ audio = AudioSegment.from_mp3(audio_path)
90
+ audio_duration_ms = len(audio)
91
+ #print(f"Generated audio duration: {audio_duration_ms}ms, Target duration: {target_duration_ms}ms") # Debug
92
+ if audio_duration_ms > target_duration_ms and target_duration_ms > 0:
93
+ speed_factor = (audio_duration_ms / target_duration_ms) * speed_adjustment_factor
94
+ #print(f"Speed factor (after user adjustment): {speed_factor}") # Debug
95
+ if speed_factor > 0:
96
+ if speed_factor < 1.0:
97
+ speed_factor = 1.0
98
+ y, sr = librosa.load(audio_path, sr=None)
99
+ y_stretched = librosa.effects.time_stretch(y, rate=speed_factor)
100
+ sf.write(audio_path, y_stretched, sr)
101
+ else:
102
+ print("Generated audio is not longer than target duration, no speed adjustment.") # Debug
103
+ return audio_path
104
  except Exception as e:
105
  print(f"Edge TTS error processing '{processed_text}': {e}")
106
  return None
107
  return None
108
 
109
  async def process_transcript_line(line, default_voice, rate, pitch, speed_adjustment_factor):
110
+ """Processes a single transcript line with HH:MM:SS,milliseconds - HH:MM:SS,milliseconds timestamp."""
111
+ match = re.match(r'(\d{2}):(\d{2}):(\d{2}),(\d{3})\s+-\s+(\d{2}):(\d{2}):(\d{2}),(\d{3})\s+(.*)', line)
112
  if match:
113
+ start_h, start_m, start_s, start_ms, end_h, end_m, end_s, end_ms, text_parts = match.groups()
114
+ start_time_ms = (
115
+ int(start_h) * 3600000 +
116
+ int(start_m) * 60000 +
117
+ int(start_s) * 1000 +
118
+ int(start_ms)
119
+ )
120
+ end_time_ms = (
121
+ int(end_h) * 3600000 +
122
+ int(end_m) * 60000 +
123
+ int(end_s) * 1000 +
124
+ int(end_ms)
125
+ )
126
  duration_ms = end_time_ms - start_time_ms
127
 
128
  audio_segments = []
129
+ split_parts = re.split(r'[“”"]', text_parts)
130
+ process_next = False
131
+ for part in split_parts:
132
  if part == '"':
133
+ process_next = not process_next
134
  continue
135
+ if process_next and part.strip():
136
+ audio_path = await generate_audio_with_voice_prefix(part, default_voice, rate, pitch, duration_ms, speed_adjustment_factor)
137
+ if audio_path:
138
+ audio_segments.append(audio_path)
139
+ elif not process_next and part.strip():
140
+ audio_path = await generate_audio_with_voice_prefix(part, default_voice, rate, pitch, duration_ms, speed_adjustment_factor)
141
  if audio_path:
142
  audio_segments.append(audio_path)
143
  return start_time_ms, audio_segments, duration_ms
144
  return None, None, None
145
 
146
  async def transcript_to_speech(transcript_text, voice, rate, pitch, speed_adjustment_factor):
147
+
148
  if not transcript_text.strip():
149
  return None, gr.Warning("Please enter transcript text.")
150
  if not voice:
 
153
  lines = transcript_text.strip().split('\n')
154
  timed_audio_segments = []
155
  max_end_time_ms = 0
156
+ for line in lines:
157
+ start_time, audio_paths, duration = await process_transcript_line(line, voice, rate, pitch, speed_adjustment_factor)
158
+ if start_time is not None and audio_paths:
159
+ combined_line_audio = AudioSegment.empty()
160
+ current_time_ms = start_time
161
+ segment_duration = duration / len(audio_paths) if audio_paths else 0
162
+ for path in audio_paths:
163
+ if path: # Only process if audio_path is not None (meaning TTS was successful)
164
+ try:
165
+ audio = AudioSegment.from_mp3(path)
166
+ combined_line_audio += audio
167
+ os.remove(path)
168
+ except FileNotFoundError:
169
+ print(f"Warning: Audio file not found: {path}")
170
+ if combined_line_audio:
171
+ timed_audio_segments.append({'start': start_time, 'audio': combined_line_audio})
172
+ max_end_time_ms = max(max_end_time_ms, start_time + len(combined_line_audio))
173
+ elif audio_paths:
174
+ for path in audio_paths:
175
+ if path:
176
+ try:
177
+ os.remove(path)
178
+ except FileNotFoundError:
179
+ pass # Clean up even if no timestamp
180
+ if not timed_audio_segments:
181
+ return None, "No processable audio segments found."
182
+ final_audio = AudioSegment.silent(duration=max_end_time_ms, frame_rate=24000)
183
+ for segment in timed_audio_segments:
184
+ final_audio = final_audio.overlay(segment['audio'], position=segment['start'])
185
+ combined_audio_path = tempfile.mktemp(suffix=".mp3")
186
+ final_audio.export(combined_audio_path, format="mp3")
187
+ return combined_audio_path, None
 
 
 
 
 
 
 
188
 
189
  @spaces.GPU
190
  def tts_interface(transcript, voice, rate, pitch, speed_adjustment_factor):
191
+
192
  audio, warning = asyncio.run(transcript_to_speech(transcript, voice, rate, pitch, speed_adjustment_factor))
193
  return audio, warning
194
 
195
  async def create_demo():
196
+
197
  voices = await get_voices()
198
  default_voice = "en-US-AndrewMultilingualNeural - en-US (Male)"
199
+ description = """