import soundfile as sf
import numpy as np

def get_silence(duration_ms=1000):
    """Export a block of silence to a temporary MP3 file and return its path.

    Args:
        duration_ms: Length of the silent segment passed to
            ``AudioSegment.silent``.

    Returns:
        The name of the temporary ``.mp3`` file. It is created with
        ``delete=False``, so removing it is left to the caller.
    """
    # 24 kHz silence, then forced to one channel and 4-byte (32-bit) samples.
    silence = AudioSegment.silent(duration=duration_ms, frame_rate=24000)
    silence = silence.set_channels(1).set_sample_width(4)

    # Extra export parameters, kept in sync with the segment settings above.
    export_parameters = [
        "-ac", "1",  # Mono
        "-ar", "24000",  # Sample rate
        "-sample_fmt", "s32",  # 32-bit samples
        "-codec:a", "libmp3lame",  # MP3 codec
    ]

    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as handle:
        output_path = handle.name
        silence.export(
            output_path,
            format="mp3",
            bitrate="48k",
            parameters=export_parameters,
        )
    return output_path

# Get all available voices
async def get_voices():
    """Return the available Edge TTS voices keyed by a display label.

    Returns:
        A dict mapping ``"<ShortName> - <Locale> (<Gender>)"`` to the voice's
        ``ShortName``. If the voice list cannot be retrieved, the error is
        printed and an empty dict is returned instead of raising.
    """
    try:
        voices = await edge_tts.list_voices()
    except Exception as e:
        # Top-level boundary: any failure to list voices degrades to
        # "no voices" rather than aborting the caller.
        print(f"Error listing voices: {e}")
        return {}
    return {
        f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName']
        for v in voices
    }

def _apply_voice_prefix(text, voice_short, rate, pitch):
    """Resolve a leading voice code (e.g. ``2M``) into voice, rate, pitch and the remaining text."""
    voice_map = {
        "1F": "en-GB-SoniaNeural",
        "2M": "en-GB-RyanNeural",
        "3M": "en-US-BrianMultilingualNeural",
        "2F": "en-US-JennyNeural",
        "1M": "en-AU-WilliamNeural",
        "3F": "en-HK-YanNeural",
        "4M": "en-GB-ThomasNeural",
        "4F": "en-US-EmmaNeural",
        "1O": "en-GB-RyanNeural",  # Old Man
        "1C": "en-GB-MaisieNeural",  # Child
        "1V": "vi-VN-HoaiMyNeural",  # Vietnamese (Female)
        "2V": "vi-VN-NamMinhNeural",  # Vietnamese (Male)
        "3V": "vi-VN-HoaiMyNeural",  # Vietnamese (Female)
        "4V": "vi-VN-NamMinhNeural",  # Vietnamese (Male)
    }
    for prefix, mapped_voice in voice_map.items():
        if not text.startswith(prefix):
            continue
        voice_short = mapped_voice
        if prefix in ("1O", "4V"):
            pitch = -20
            rate = -10
        # NOTE(review): the code also singled out "1F", "3F", "1V" and "3V",
        # but that branch had no body. Those prefixes keep the caller's
        # pitch/rate here until the intended adjustment is confirmed.

        remainder = text[len(prefix):]
        # An optional number written directly after the prefix ("2M15" or
        # "2M-15") is added to the pitch. The dash is a separator, not a
        # sign: the value is always added.
        offset = re.match(r'-?(\d+)', remainder)
        if offset:
            pitch += int(offset.group(1))
            remainder = remainder[offset.end():]
        text = remainder.strip()
        break
    return text, voice_short, rate, pitch


def _speed_up_to_fit(audio_path, target_duration_ms, speed_adjustment_factor):
    """Time-stretch the file at ``audio_path`` in place when it is longer than the target duration."""
    audio_duration_ms = len(AudioSegment.from_mp3(audio_path))
    if target_duration_ms <= 0 or audio_duration_ms <= target_duration_ms:
        print("Generated audio is not longer than target duration, no speed adjustment.") # Debug
        return
    speed_factor = (audio_duration_ms / target_duration_ms) * speed_adjustment_factor
    if speed_factor <= 0:
        return
    # Never slow the audio down: the factor is clamped to at least 1.0.
    speed_factor = max(speed_factor, 1.0)
    y, sr = librosa.load(audio_path, sr=None)
    y_stretched = librosa.effects.time_stretch(y, rate=speed_factor)
    sf.write(audio_path, y_stretched, sr)


async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pitch, target_duration_ms=None, speed_adjustment_factor=1.0):
    """Generate speech for one text segment, honouring an optional voice prefix.

    The segment may start with a two-character voice code (``1F``, ``2M``,
    ...) that selects a specific voice, optionally followed immediately by a
    number that is added to the pitch. The code and number are removed from
    the spoken text.

    Args:
        text_segment: Raw text, possibly starting with a voice code.
        default_voice: Voice label; the part before ``" - "`` is used as the
            voice name when no voice code is present.
        rate: Speaking-rate adjustment, formatted as a signed percentage.
        pitch: Pitch adjustment, formatted as a signed ``Hz`` value.
        target_duration_ms: When given, audio longer than this is
            time-stretched to be faster.
        speed_adjustment_factor: Multiplier applied to the computed speed-up.

    Returns:
        Path of the generated temporary ``.mp3`` file, or ``None`` when there
        is no text left to speak or generation fails.
    """
    default_voice_short = default_voice.split(" - ")[0] if default_voice else ""
    processed_text = text_segment.strip()
    print(f"Processing this  text segment: {processed_text}") # Debug

    processed_text, current_voice_short, current_rate, current_pitch = _apply_voice_prefix(
        processed_text, default_voice_short, rate, pitch
    )
    if not processed_text:
        return None

    # The "+d" format spec rejects floats, so coerce to int first.
    rate_str = f"{int(current_rate):+d}%"
    pitch_str = f"{int(current_pitch):+d}Hz"

    audio_path = None
    try:
        communicate = edge_tts.Communicate(
            processed_text, current_voice_short, rate=rate_str, pitch=pitch_str
        )
        # Reserve a file name, then close the handle before writing to it.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
            audio_path = tmp_file.name
        await communicate.save(audio_path)
        if target_duration_ms is not None and os.path.exists(audio_path):
            _speed_up_to_fit(audio_path, target_duration_ms, speed_adjustment_factor)
        return audio_path
    except Exception as e:
        print(f"Edge TTS error processing '{processed_text}': {e}")
        if audio_path is not None:
            # Best-effort cleanup so a failed segment does not leave a temp file.
            try:
                os.remove(audio_path)
            except OSError:
                pass
        return None

def _timestamp_to_ms(hours, minutes, seconds, millis):
    """Convert the string fields of an ``HH:MM:SS,mmm`` timestamp to milliseconds."""
    return (
        int(hours) * 3600000
        + int(minutes) * 60000
        + int(seconds) * 1000
        + int(millis)
    )


async def process_transcript_line(line, default_voice, rate, pitch, speed_adjustment_factor):
    """Turn one timestamped transcript line into generated audio files.

    A line has the form ``HH:MM:SS,mmm - HH:MM:SS,mmm <text>``. The text is
    split on straight and curly double quotes, and every non-blank piece is
    synthesized separately, each with the full line duration as its target.

    Args:
        line: One transcript line; leading whitespace is tolerated.
        default_voice: Voice label used when a piece has no voice prefix.
        rate: Speaking-rate adjustment forwarded to the generator.
        pitch: Pitch adjustment forwarded to the generator.
        speed_adjustment_factor: Speed multiplier forwarded to the generator.

    Returns:
        ``(start_time_ms, audio_paths, duration_ms)`` for a well-formed line,
        where ``audio_paths`` holds only the pieces that were generated
        successfully, or ``(None, None, None)`` when the line does not match
        the timestamp format.
    """
    match = re.match(
        r'\s*(\d{2}):(\d{2}):(\d{2}),(\d{3})\s+-\s+(\d{2}):(\d{2}):(\d{2}),(\d{3})\s+(.*)',
        line,
    )
    if not match:
        return None, None, None

    *stamp_fields, text_parts = match.groups()
    start_time_ms = _timestamp_to_ms(*stamp_fields[:4])
    end_time_ms = _timestamp_to_ms(*stamp_fields[4:])
    duration_ms = end_time_ms - start_time_ms

    audio_segments = []
    # The split discards the quote characters themselves, so quoted and
    # unquoted pieces are handled the same way.
    for part in re.split(r'[“”"]', text_parts):
        if not part.strip():
            continue
        audio_path = await generate_audio_with_voice_prefix(
            part, default_voice, rate, pitch, duration_ms, speed_adjustment_factor
        )
        if audio_path:
            audio_segments.append(audio_path)
    return start_time_ms, audio_segments, duration_ms

async def transcript_to_speech(transcript_text, voice, rate, pitch, speed_adjustment_factor):
    """Render a timestamped transcript into a single MP3 file.

    Each line is synthesized via ``process_transcript_line``; the pieces of a
    line are concatenated and overlaid onto a silent track at the line's
    start time. Intermediate per-piece files are deleted once loaded.

    Args:
        transcript_text: Transcript with one timestamped entry per line.
        voice: Default voice label for pieces without a voice prefix.
        rate: Speaking-rate adjustment forwarded to the generator.
        pitch: Pitch adjustment forwarded to the generator.
        speed_adjustment_factor: Speed multiplier forwarded to the generator.

    Returns:
        ``(audio_path, warning)``. ``audio_path`` is the combined ``.mp3``
        file, or ``None`` when nothing was produced; ``warning`` is ``None``
        on success, otherwise the result of ``gr.Warning`` or a message string.
    """
    if not transcript_text or not transcript_text.strip():
        return None, gr.Warning("Please enter transcript text.")
    if not voice:
        return None, gr.Warning("Please select a voice.")

    timed_audio_segments = []
    max_end_time_ms = 0
    for line in transcript_text.strip().split('\n'):
        start_time, audio_paths, _duration = await process_transcript_line(
            line, voice, rate, pitch, speed_adjustment_factor
        )
        if not audio_paths:
            continue

        if start_time is None:
            # Audio without a timestamp cannot be placed; just clean it up.
            for path in audio_paths:
                if path:
                    try:
                        os.remove(path)
                    except FileNotFoundError:
                        pass
            continue

        combined_line_audio = AudioSegment.empty()
        for path in audio_paths:
            if not path:
                continue
            try:
                combined_line_audio += AudioSegment.from_mp3(path)
                os.remove(path)
            except FileNotFoundError:
                print(f"Warning: Audio file not found: {path}")

        if combined_line_audio:
            timed_audio_segments.append({'start': start_time, 'audio': combined_line_audio})
            max_end_time_ms = max(max_end_time_ms, start_time + len(combined_line_audio))

    if not timed_audio_segments:
        return None, "No processable audio segments found."

    # Silent base track long enough to hold the latest-ending segment.
    final_audio = AudioSegment.silent(duration=max_end_time_ms, frame_rate=24000)
    for segment in timed_audio_segments:
        final_audio = final_audio.overlay(segment['audio'], position=segment['start'])

    # NamedTemporaryFile creates the file atomically; tempfile.mktemp only
    # returns a name and is deprecated because another process can claim it.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as out_file:
        combined_audio_path = out_file.name
    final_audio.export(combined_audio_path, format="mp3")
    return combined_audio_path, None

@spaces.GPU
def tts_interface(transcript, voice, rate, pitch, speed_adjustment_factor):
    """Synchronous entry point that runs ``transcript_to_speech`` to completion.

    Returns:
        The ``(audio, warning)`` pair produced by ``transcript_to_speech``.
    """
    pending = transcript_to_speech(transcript, voice, rate, pitch, speed_adjustment_factor)
    result_audio, result_warning = asyncio.run(pending)
    return result_audio, result_warning

async def create_demo():

    voices = await get_voices()
    default_voice = "en-US-AndrewMultilingualNeural - en-US (Male)"
    description = """