# Source: Hugging Face Space "hivecorp/master-tts-pro" (status: Running).
# File size at capture time: 6,944 bytes.
#
# NOTE: the lines that originally preceded the imports were web-page residue
# (navigation text and a per-line commit "blame" column), not Python code.
# Commits referenced by that blame column:
#   f7e1683 0ce84cc f4b5c65 f5e4024 ec46cb3 95d954d 4b97382 bccb8c6
#   7697af6 a0f7708 c14c0c8 f1e232e 3927c7f 8b3735e 077e0e7

import gradio as gr
from pydub import AudioSegment
import edge_tts
import os
import asyncio
import uuid
import re

# Function to get the length of an audio file in seconds
def get_audio_length(audio_file):
    """Return the playing time of ``audio_file`` in seconds.

    The file is decoded with ``AudioSegment.from_file``; the length of the
    resulting segment (which pydub reports in milliseconds) is divided by
    1000, so the result is a float number of seconds.
    """
    duration_ms = len(AudioSegment.from_file(audio_file))
    return duration_ms / 1000  # Return in seconds for compatibility

# Function to format a millisecond count as an SRT timestamp
def format_time_ms(milliseconds):
    """Format a duration in milliseconds as an SRT timestamp ``HH:MM:SS,mmm``.

    Args:
        milliseconds: Duration in milliseconds (int or float).

    Returns:
        The timestamp string, e.g. ``"01:01:01,001"`` for ``3661001``.

    The value is rounded to the nearest millisecond rather than truncated:
    floats that went through a seconds <-> milliseconds conversion are often
    a hair below the intended integer (``1.001 * 1000`` is
    ``1000.9999999999999``) and truncation would lose a millisecond.
    Negative inputs are clamped to zero because SRT has no negative times.
    """
    total_ms = max(0, round(milliseconds))
    seconds, ms = divmod(total_ms, 1000)
    mins, secs = divmod(seconds, 60)
    hrs, mins = divmod(mins, 60)
    return f"{hrs:02}:{mins:02}:{secs:02},{ms:03}"

# Function to split text into segments based on punctuation, ensuring no word is split
def split_text_into_segments(text, max_words=8):
    """Split ``text`` into subtitle-sized segments.

    The text is first cut after each run of ``.``, ``!``, ``?`` or ``,``;
    every resulting piece is then broken into chunks of at most
    ``max_words`` whitespace-separated words.

    Args:
        text: The text to split.
        max_words: Maximum number of words per segment (default 8).

    Returns:
        A list of non-empty segment strings, in reading order.

    Raises:
        ValueError: If ``max_words`` is smaller than 1.

    Notes:
        * Consecutive punctuation marks ("...", "?!") stay attached to the
          preceding words instead of becoming punctuation-only segments.
        * Trailing text without closing punctuation is chunked like any
          other piece.
        * Whitespace inside a segment (including newlines) is collapsed to
          single spaces, so a segment never contains a blank line that would
          terminate an SRT cue early.
    """
    if max_words < 1:
        raise ValueError("max_words must be at least 1")

    segments = []

    def add_chunks(piece):
        # Emit `piece` as chunks of at most `max_words` words.
        words = piece.split()
        for start in range(0, len(words), max_words):
            segments.append(" ".join(words[start:start + max_words]))

    # With one capture group, re.split returns an odd-length list that
    # alternates: body, punctuation, body, punctuation, ..., trailing body.
    parts = re.split(r'([.!?,]+)', text)

    for i in range(0, len(parts) - 1, 2):
        body = parts[i].strip()
        punctuation = parts[i + 1]
        if not body:
            # Punctuation with no words before it: keep it on the previous
            # segment (if any) rather than emitting a punctuation-only cue.
            if segments:
                segments[-1] += punctuation
            continue
        add_chunks(body + punctuation)

    # Whatever follows the last punctuation mark (possibly the whole text).
    add_chunks(parts[-1])

    return segments

# Function to generate SRT with millisecond accuracy per batch
async def generate_accurate_srt(batch_text, batch_num, start_offset, pitch, rate, voice):
    """Synthesize one batch of text and build its SRT cues.

    Args:
        batch_text: Text of this batch.
        batch_num: Zero-based batch index; used for cue numbering.
        start_offset: Position (milliseconds) in the final audio at which
            this batch starts.
        pitch: Pitch string passed to ``edge_tts.Communicate``.
        rate: Rate string passed to ``edge_tts.Communicate``.
        voice: Voice name passed to ``edge_tts.Communicate``.

    Returns:
        A tuple ``(srt_content, audio_file, end_offset)``:
        the SRT text for this batch, the path of the temporary audio file
        (the caller is responsible for deleting it), and the position in
        milliseconds at which this batch's audio ends.

    The measured audio length is divided evenly between the segments
    returned by ``split_text_into_segments``.
    """
    # A random component keeps concurrent requests from overwriting each
    # other's temporary file. NOTE(review): edge-tts is understood to emit
    # MP3 data by default, hence the extension - confirm if the output format
    # is ever changed.
    audio_file = f"batch_{batch_num}_{uuid.uuid4().hex}_audio.mp3"

    tts = edge_tts.Communicate(batch_text, voice, rate=rate, pitch=pitch)
    await tts.save(audio_file)

    actual_length = get_audio_length(audio_file) * 1000  # Convert to milliseconds
    batch_end = start_offset + actual_length

    segments = split_text_into_segments(batch_text)
    if not segments:
        # Nothing to caption (e.g. whitespace-only batch); the audio still
        # occupies time, so report where it ends. Avoids dividing by zero.
        return "", audio_file, batch_end

    segment_duration = actual_length / len(segments)
    last_index = len(segments) - 1
    start_time = start_offset

    cues = []
    for index, segment in enumerate(segments):
        if index == last_index:
            # Pin the final cue to the measured end so float error from the
            # repeated additions cannot leave a gap or overshoot.
            end_time = batch_end
        else:
            end_time = min(start_time + segment_duration, batch_end)

        cues.append(
            f"{index + 1 + (batch_num * 100)}\n"
            f"{format_time_ms(start_time)} --> {format_time_ms(end_time)}\n"
            f"{segment}\n\n"
        )

        start_time = end_time

    return "".join(cues), audio_file, batch_end

# Batch processing function with millisecond accuracy

# Maximum number of characters sent to the TTS service in one request.
_BATCH_CHAR_LIMIT = 500

# Matches a complete SRT timing line such as "00:00:01,000 --> 00:00:02,500".
_SRT_TIMING_LINE = re.compile(
    r"(\d+):(\d{2}):(\d{2}),(\d{3}) --> (\d+):(\d{2}):(\d{2}),(\d{3})"
)


def _split_into_batches(text, limit=_BATCH_CHAR_LIMIT):
    """Split ``text`` into pieces of at most ``limit`` characters, cutting at whitespace."""
    batches = []
    remaining = (text or "").strip()
    while len(remaining) > limit:
        window = remaining[:limit + 1]
        cut = max(window.rfind(ws) for ws in (" ", "\n", "\t", "\r"))
        if cut <= 0:
            # A single "word" longer than the limit: fall back to a hard cut.
            cut = limit
        batches.append(remaining[:cut].strip())
        remaining = remaining[cut:].strip()
    if remaining:
        batches.append(remaining)
    return batches


def _timestamp_to_ms(hrs, mins, secs, ms):
    """Convert the four captured fields of an SRT timestamp to integer milliseconds."""
    return ((int(hrs) * 60 + int(mins)) * 60 + int(secs)) * 1000 + int(ms)


async def batch_process_srt_and_audio(script_text, pitch, rate, voice, progress=gr.Progress()):
    """Convert a script into one MP3 file and a matching SRT subtitle file.

    The script is split into batches, each batch is synthesized with
    ``generate_accurate_srt``, the audio pieces are concatenated, and the
    collected cues are renumbered and clamped to the final audio length.

    Args:
        script_text: Full script text.
        pitch: Pitch string forwarded to ``generate_accurate_srt``.
        rate: Rate string forwarded to ``generate_accurate_srt``.
        voice: Voice name forwarded to ``generate_accurate_srt``.
        progress: Gradio progress tracker, called after every batch with the
            completed fraction.

    Returns:
        A tuple ``(final_srt_path, final_audio_path)`` of files written to the
        current working directory.

    Raises:
        gr.Error: If ``script_text`` contains no text.
    """
    # Cut at whitespace so no word is split between two TTS requests.
    batches = _split_into_batches(script_text)
    if not batches:
        raise gr.Error("Please enter some script text to convert.")

    srt_parts = []
    combined_audio = AudioSegment.empty()
    start_offset = 0.0

    for batch_num, batch_text in enumerate(batches):
        srt_content, audio_file, _ = await generate_accurate_srt(
            batch_text, batch_num, start_offset, pitch, rate, voice
        )
        srt_parts.append(srt_content)

        try:
            combined_audio += AudioSegment.from_file(audio_file)
        finally:
            # Always remove the temporary file, even if decoding failed.
            os.remove(audio_file)

        # The next batch starts exactly where the combined audio now ends.
        start_offset = float(len(combined_audio))
        progress((batch_num + 1) / len(batches))

    total_audio_ms = len(combined_audio)

    validated_lines = []
    cue_number = 0
    for line in "".join(srt_parts).strip().splitlines():
        timing = _SRT_TIMING_LINE.fullmatch(line)
        if timing:
            cue_number += 1
            if validated_lines:
                # The line just before a timing line is the cue index;
                # renumber it so the file counts 1, 2, 3, ... across batches.
                validated_lines[-1] = str(cue_number)
            fields = timing.groups()
            # Integer arithmetic avoids losing milliseconds to float error.
            start_ms = min(_timestamp_to_ms(*fields[:4]), total_audio_ms)
            end_ms = min(_timestamp_to_ms(*fields[4:]), total_audio_ms)
            line = f"{format_time_ms(start_ms)} --> {format_time_ms(end_ms)}"
        validated_lines.append(line)
    validated_srt_content = "".join(f"{line}\n" for line in validated_lines)

    unique_id = uuid.uuid4()
    final_audio_path = f"final_audio_{unique_id}.mp3"
    final_srt_path = f"final_subtitles_{unique_id}.srt"

    combined_audio.export(final_audio_path, format="mp3", bitrate="320k")

    # Explicit UTF-8 so non-ASCII script text is written identically on
    # every platform.
    with open(final_srt_path, "w", encoding="utf-8") as srt_file:
        srt_file.write(validated_srt_content)

    return final_srt_path, final_audio_path

# Gradio interface function
async def process_script(script_text, pitch, rate, voice):
    """Gradio callback: turn the form inputs into an SRT file and an audio file.

    Args:
        script_text: Script text from the textbox.
        pitch: Pitch adjustment in Hz from the slider.
        rate: Rate adjustment in percent from the slider.
        voice: Display name of the voice; must be a key of ``voice_options``.

    Returns:
        ``(srt_path, audio_path, audio_path)`` - the audio path is returned
        twice because the interface shows it both as a download and as a
        player.
    """
    # edge-tts expects an explicit sign on both values ("+5Hz", "-10%",
    # "+0%"); the "+d" format always emits one, including for zero.
    pitch_str = f"{int(pitch):+d}Hz"
    formatted_rate = f"{int(rate):+d}%"
    srt_path, audio_path = await batch_process_srt_and_audio(script_text, pitch_str, formatted_rate, voice_options[voice])
    return srt_path, audio_path, audio_path

# Voices offered in the dropdown: display label -> edge-tts voice name
voice_options = {
    "Andrew Male": "en-US-AndrewNeural",
    "Jenny Female": "en-US-JennyNeural",
    "Guy Male": "en-US-GuyNeural",
    "Ana Female": "en-US-AnaNeural",
    "Aria Female": "en-US-AriaNeural",
    "Brian Male": "en-US-BrianNeural",
    "Christopher Male": "en-US-ChristopherNeural",
    "Eric Male": "en-US-EricNeural",
    "Michelle Female": "en-US-MichelleNeural",
    "Roger Male": "en-US-RogerNeural",
    "Natasha Female": "en-AU-NatashaNeural",
    "William Male": "en-AU-WilliamNeural",
    "Clara Female": "en-CA-ClaraNeural",
    "Liam Male": "en-CA-LiamNeural",
    "Libby Female": "en-GB-LibbyNeural",
    "Maisie": "en-GB-MaisieNeural",
    "Ryan": "en-GB-RyanNeural",
    "Sonia": "en-GB-SoniaNeural",
    "Thomas": "en-GB-ThomasNeural",
    "Sam": "en-HK-SamNeural",
    "Yan": "en-HK-YanNeural",
    "Connor": "en-IE-ConnorNeural",
    "Emily": "en-IE-EmilyNeural",
    "Neerja": "en-IN-NeerjaNeural",
    "Prabhat": "en-IN-PrabhatNeural",
    "Asilia": "en-KE-AsiliaNeural",
    "Chilemba": "en-KE-ChilembaNeural",
    "Abeo": "en-NG-AbeoNeural",
    "Ezinne": "en-NG-EzinneNeural",
    "Mitchell": "en-NZ-MitchellNeural",
    "James": "en-PH-JamesNeural",
    "Rosa": "en-PH-RosaNeural",
    "Luna": "en-SG-LunaNeural",
    "Wayne": "en-SG-WayneNeural",
    "Elimu": "en-TZ-ElimuNeural",
    "Imani": "en-TZ-ImaniNeural",
    "Leah": "en-ZA-LeahNeural",
    "Luke": "en-ZA-LukeNeural",
    # Add other voices here...
}

# Gradio interface: script text, pitch, rate and voice in; SRT file, audio
# file and an audio player out.
app = gr.Interface(
    fn=process_script,
    inputs=[
        gr.Textbox(label="Enter Script Text", lines=10),
        gr.Slider(label="Pitch Adjustment (Hz)", minimum=-20, maximum=20, value=0, step=1),
        gr.Slider(label="Rate Adjustment (%)", minimum=-50, maximum=50, value=-1, step=1),
        gr.Dropdown(label="Select Voice", choices=list(voice_options.keys()), value="Andrew Male"),
    ],
    outputs=[
        gr.File(label="Download SRT File"),
        gr.File(label="Download Audio File"),
        gr.Audio(label="Audio Playback")
    ],
    title="HIVEcorp Text-to-Speech with Millisecond SRT Generation",
    description="Convert your script into Audio and generate Subtitles.",
    theme="compact",
)

# Start the server only when run as a script, so importing this module
# (e.g. to reuse the helpers) does not launch the web app.
if __name__ == "__main__":
    app.launch()