import random
import re

import numpy as np
import torch
import gradio as gr
import spaces

from chatterbox.src.chatterbox.tts import ChatterboxTTS

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"๐Ÿš€ Running on device: {DEVICE}")

# --- Global Model Initialization ---
# Loaded once and cached at module level so repeated Gradio callbacks reuse
# the same weights instead of re-downloading/re-initializing per request.
MODEL = None


def get_or_load_model():
    """Load the ChatterboxTTS model on first use and cache it globally.

    Returns:
        The cached ChatterboxTTS instance, moved to DEVICE if necessary.

    Raises:
        Exception: re-raises whatever ``from_pretrained`` raised on failure.
    """
    global MODEL
    if MODEL is None:
        print("Model not loaded, initializing...")
        try:
            MODEL = ChatterboxTTS.from_pretrained(DEVICE)
            # Some builds expose a .device attribute; move only on mismatch.
            if hasattr(MODEL, 'to') and str(MODEL.device) != DEVICE:
                MODEL.to(DEVICE)
            print(f"Model loaded successfully. Internal device: {getattr(MODEL, 'device', 'N/A')}")
        except Exception as e:
            print(f"Error loading model: {e}")
            raise
    return MODEL


# Attempt to load the model at startup so the first request is not slow;
# a failure here is logged but the app still starts (requests will retry).
try:
    get_or_load_model()
except Exception as e:
    print(f"CRITICAL: Failed to load model on startup. Application may not function. Error: {e}")


def set_seed(seed: int):
    """Seed torch, CUDA, random and numpy for reproducible generation."""
    torch.manual_seed(seed)
    if DEVICE == "cuda":
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    random.seed(seed)
    np.random.seed(seed)


def split_text_into_chunks(text: str, max_chars: int = 250) -> list[str]:
    """Split text into sentence-based chunks of at most ``max_chars`` chars.

    Sentences are packed together while they fit. A sentence longer than
    ``max_chars`` is split on word boundaries, and a single word longer than
    ``max_chars`` is hard-sliced, so no returned chunk exceeds the limit.

    Args:
        text: Input text of arbitrary length.
        max_chars: Maximum length of each returned chunk.

    Returns:
        List of non-empty chunk strings (empty list for empty input).
    """
    # Basic sentence segmentation: split after ., ! or ? followed by spaces.
    sentences = re.split(r'(?<=[.!?])\s+', text.strip())
    chunks: list[str] = []
    current_chunk = ""
    for sentence in sentences:
        # Keep packing sentences into the current chunk while they fit.
        if len(current_chunk) + len(sentence) + 1 <= max_chars:
            current_chunk = f"{current_chunk} {sentence}" if current_chunk else sentence
            continue
        # Flush the filled chunk and start a new one.
        if current_chunk:
            chunks.append(current_chunk)
        if len(sentence) > max_chars:
            # Sentence itself exceeds the limit: split on word boundaries.
            temp_chunk = ""
            for word in sentence.split():
                # Hard-slice pathological words longer than max_chars so the
                # per-chunk limit is never violated.
                while len(word) > max_chars:
                    if temp_chunk:
                        chunks.append(temp_chunk)
                        temp_chunk = ""
                    chunks.append(word[:max_chars])
                    word = word[max_chars:]
                if not word:
                    continue
                if len(temp_chunk) + len(word) + 1 <= max_chars:
                    temp_chunk = f"{temp_chunk} {word}" if temp_chunk else word
                else:
                    if temp_chunk:
                        chunks.append(temp_chunk)
                    temp_chunk = word
            # Unconditional reset (even to "") avoids re-appending the chunk
            # that was already flushed above.
            current_chunk = temp_chunk
        else:
            current_chunk = sentence
    # Flush the trailing chunk.
    if current_chunk:
        chunks.append(current_chunk)
    return chunks


@spaces.GPU
def generate_tts_audio(
    text_input: str,
    audio_prompt_path_input: str,
    exaggeration_input: float,
    temperature_input: float,
    seed_num_input: int,
    cfgw_input: float,
    chunk_size_input: int,
    progress=gr.Progress(),
) -> tuple[int, np.ndarray]:
    """Generate TTS audio for long text by chunking and concatenating.

    All chunks are synthesized inside a single GPU context; 0.2 s of silence
    is inserted between chunks for natural transitions. A chunk that fails
    to synthesize is skipped (best-effort) rather than aborting the run.

    Args:
        text_input: Text of arbitrary length.
        audio_prompt_path_input: Optional reference-audio file path.
        exaggeration_input: Expressiveness control.
        temperature_input: Sampling temperature.
        seed_num_input: Random seed (0 means "do not seed").
        cfgw_input: CFG / pace weight.
        chunk_size_input: Maximum characters per chunk.
        progress: Gradio progress callback (injected by Gradio).

    Returns:
        ``(sample_rate, waveform)`` tuple suitable for a ``gr.Audio`` output.

    Raises:
        RuntimeError: if the model is unavailable or no chunk succeeded.
    """
    current_model = get_or_load_model()
    if current_model is None:
        raise RuntimeError("TTS model is not loaded.")
    if seed_num_input != 0:
        set_seed(int(seed_num_input))

    # Split the text into synthesizable chunks.
    chunks = split_text_into_chunks(text_input, max_chars=chunk_size_input)
    total_chunks = len(chunks)
    print(f"ํ…์ŠคํŠธ๋ฅผ {total_chunks}๊ฐœ์˜ ์ฒญํฌ๋กœ ๋ถ„ํ• ํ–ˆ์Šต๋‹ˆ๋‹ค.")

    # Synthesize each chunk independently.
    audio_segments = []
    for i, chunk in enumerate(chunks):
        progress((i + 1) / total_chunks, f"์ฒญํฌ {i + 1}/{total_chunks} ์ƒ์„ฑ ์ค‘...")
        print(f"์ฒญํฌ {i + 1}/{total_chunks} ์ƒ์„ฑ ์ค‘: '{chunk[:50]}...'")
        try:
            wav = current_model.generate(
                chunk,
                audio_prompt_path=audio_prompt_path_input,
                exaggeration=exaggeration_input,
                temperature=temperature_input,
                cfg_weight=cfgw_input,
            )
            # detach/cpu before numpy(): .numpy() raises on CUDA-resident or
            # grad-tracking tensors; this is a no-op for plain CPU tensors.
            audio_segments.append(wav.squeeze(0).detach().cpu().numpy())
        except Exception as e:
            # Best-effort: log and skip the failed chunk, keep going.
            print(f"์ฒญํฌ {i + 1} ์ƒ์„ฑ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {e}")
            continue

    if not audio_segments:
        raise RuntimeError("์˜ค๋””์˜ค ์ƒ์„ฑ์— ์‹คํŒจํ–ˆ์Šต๋‹ˆ๋‹ค.")

    # Insert 0.2 s of silence between chunks. Matching the segment dtype
    # prevents np.concatenate from silently promoting the result to float64.
    silence = np.zeros(int(0.2 * current_model.sr), dtype=audio_segments[0].dtype)
    final_audio = []
    for i, segment in enumerate(audio_segments):
        final_audio.append(segment)
        if i < len(audio_segments) - 1:  # no silence after the last segment
            final_audio.append(silence)
    concatenated_audio = np.concatenate(final_audio)
    print(f"์˜ค๋””์˜ค ์ƒ์„ฑ ์™„๋ฃŒ. ์ด ๊ธธ์ด: {len(concatenated_audio) / current_model.sr:.2f}์ดˆ")
    return (current_model.sr, concatenated_audio)


# Simple wrapper for single-chunk generation (with its own GPU decorator).
@spaces.GPU
def generate_single_audio(
    text_input: str,
    audio_prompt_path_input: str,
    exaggeration_input: float,
    temperature_input: float,
    seed_num_input: int,
    cfgw_input: float,
) -> tuple[int, np.ndarray]:
    """Generate TTS audio for a single short text (truncated to 300 chars).

    Returns:
        ``(sample_rate, waveform)`` tuple for a ``gr.Audio`` output.

    Raises:
        RuntimeError: if the model is unavailable.
    """
    current_model = get_or_load_model()
    if current_model is None:
        raise RuntimeError("TTS model is not loaded.")
    if seed_num_input != 0:
        set_seed(int(seed_num_input))
    print(f"Generating audio for text: '{text_input[:50]}...'")
    wav = current_model.generate(
        text_input[:300],  # hard cap at 300 chars for safety
        audio_prompt_path=audio_prompt_path_input,
        exaggeration=exaggeration_input,
        temperature=temperature_input,
        cfg_weight=cfgw_input,
    )
    print("Audio generation complete.")
    # detach/cpu before numpy() — see generate_tts_audio.
    return (current_model.sr, wav.squeeze(0).detach().cpu().numpy())


with gr.Blocks() as demo:
    gr.Markdown(
        """
        # Chatterbox TTS Demo - ๋ฌด์ œํ•œ ๊ธธ์ด ๋ฒ„์ „
        ๊ธด ํ…์ŠคํŠธ๋„ ์ฒญํฌ๋กœ ๋‚˜๋ˆ„์–ด ์ฒ˜๋ฆฌํ•˜์—ฌ ์ œํ•œ ์—†์ด ์Œ์„ฑ์„ ์ƒ์„ฑํ•ฉ๋‹ˆ๋‹ค.
        """
    )
    with gr.Row():
        with gr.Column():
            text = gr.Textbox(
                value="Now let's make my mum's favourite. So three mars bars into the pan. Then we add the tuna and just stir for a bit, just let the chocolate and fish infuse. A sprinkle of olive oil and some tomato ketchup. Now smell that. Oh boy this is going to be incredible.",
                label="ํ…์ŠคํŠธ ์ž…๋ ฅ (๊ธธ์ด ์ œํ•œ ์—†์Œ)",
                lines=10,
                max_lines=30
            )
            ref_wav = gr.Audio(
                sources=["upload", "microphone"],
                type="filepath",
                label="Reference Audio File (Optional)",
                value="https://storage.googleapis.com/chatterbox-demo-samples/prompts/female_shadowheart4.flac"
            )
            with gr.Row():
                exaggeration = gr.Slider(
                    0.25, 2, step=.05,
                    label="Exaggeration (Neutral = 0.5)",
                    value=.5
                )
                cfg_weight = gr.Slider(
                    0.2, 1, step=.05,
                    label="CFG/Pace",
                    value=0.5
                )
            with gr.Row():
                chunk_size = gr.Slider(
                    100, 300, step=50,
                    label="์ฒญํฌ ํฌ๊ธฐ (๋ฌธ์ž ์ˆ˜)",
                    value=250,
                    info="ํ…์ŠคํŠธ๋ฅผ ๋‚˜๋ˆŒ ์ฒญํฌ์˜ ์ตœ๋Œ€ ํฌ๊ธฐ์ž…๋‹ˆ๋‹ค. ์ž‘์„์ˆ˜๋ก ๋” ์ž์—ฐ์Šค๋Ÿฝ์ง€๋งŒ ์ฒ˜๋ฆฌ ์‹œ๊ฐ„์ด ๊ธธ์–ด์ง‘๋‹ˆ๋‹ค."
                )
                mode = gr.Radio(
                    choices=["๋‹จ์ผ ์ƒ์„ฑ (300์ž ์ดํ•˜)", "์ฒญํฌ ๋ถ„ํ•  (๋ฌด์ œํ•œ)"],
                    value="์ฒญํฌ ๋ถ„ํ•  (๋ฌด์ œํ•œ)",
                    label="์ƒ์„ฑ ๋ชจ๋“œ"
                )
            with gr.Accordion("๊ณ ๊ธ‰ ์˜ต์…˜", open=False):
                seed_num = gr.Number(value=0, label="Random seed (0 for random)")
                temp = gr.Slider(0.05, 5, step=.05, label="Temperature", value=.8)
            run_btn = gr.Button("์Œ์„ฑ ์ƒ์„ฑ", variant="primary")
        with gr.Column():
            audio_output = gr.Audio(label="์ƒ์„ฑ๋œ ์Œ์„ฑ")
            # Live text statistics display.
            char_count = gr.Textbox(
                label="ํ…์ŠคํŠธ ์ •๋ณด",
                value="0 ๋ฌธ์ž, ์•ฝ 0๊ฐœ ์ฒญํฌ",
                interactive=False
            )

    # Update character count and estimated chunk count as the inputs change.
    def update_char_count(text, chunk_size, mode):
        char_len = len(text)
        if mode == "๋‹จ์ผ ์ƒ์„ฑ (300์ž ์ดํ•˜)":
            if char_len > 300:
                return f"{char_len} ๋ฌธ์ž (โš ๏ธ 300์ž ์ดˆ๊ณผ - ์ž˜๋ฆด ์ˆ˜ ์žˆ์Œ)"
            return f"{char_len} ๋ฌธ์ž"
        chunks = split_text_into_chunks(text, max_chars=chunk_size)
        return f"{char_len} ๋ฌธ์ž, ์•ฝ {len(chunks)}๊ฐœ ์ฒญํฌ๋กœ ๋ถ„ํ• ๋จ"

    text.change(
        fn=update_char_count,
        inputs=[text, chunk_size, mode],
        outputs=[char_count]
    )
    chunk_size.change(
        fn=update_char_count,
        inputs=[text, chunk_size, mode],
        outputs=[char_count]
    )
    mode.change(
        fn=update_char_count,
        inputs=[text, chunk_size, mode],
        outputs=[char_count]
    )

    # Dispatch to the single-shot or chunked generator based on the mode.
    def process_audio(text, ref_wav, exaggeration, temp, seed_num, cfg_weight, chunk_size, mode):
        if mode == "๋‹จ์ผ ์ƒ์„ฑ (300์ž ์ดํ•˜)":
            return generate_single_audio(text, ref_wav, exaggeration, temp, seed_num, cfg_weight)
        return generate_tts_audio(text, ref_wav, exaggeration, temp, seed_num, cfg_weight, chunk_size)

    run_btn.click(
        fn=process_audio,
        inputs=[
            text,
            ref_wav,
            exaggeration,
            temp,
            seed_num,
            cfg_weight,
            chunk_size,
            mode
        ],
        outputs=[audio_output],
    )

    gr.Markdown(
        """
        ### ์‚ฌ์šฉ ํŒ:
        - **๋‹จ์ผ ์ƒ์„ฑ ๋ชจ๋“œ**: 300์ž ์ดํ•˜์˜ ์งง์€ ํ…์ŠคํŠธ์— ์ ํ•ฉํ•˜๋ฉฐ ๋น ๋ฅด๊ฒŒ ์ƒ์„ฑ๋ฉ๋‹ˆ๋‹ค
        - **์ฒญํฌ ๋ถ„ํ•  ๋ชจ๋“œ**: ๊ธด ํ…์ŠคํŠธ๋ฅผ ์ž๋™์œผ๋กœ ์—ฌ๋Ÿฌ ๋ถ€๋ถ„์œผ๋กœ ๋‚˜๋ˆ„์–ด ์ฒ˜๋ฆฌํ•ฉ๋‹ˆ๋‹ค
        - ์ฒญํฌ ํฌ๊ธฐ๋ฅผ ์กฐ์ ˆํ•˜์—ฌ ํ’ˆ์งˆ๊ณผ ์†๋„์˜ ๊ท ํ˜•์„ ๋งž์ถœ ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค
        - ๊ฐ ์ฒญํฌ ์‚ฌ์ด์—๋Š” ์ž์—ฐ์Šค๋Ÿฌ์šด ์ „ํ™˜์„ ์œ„ํ•ด ์งง์€ ๋ฌด์Œ์ด ์ถ”๊ฐ€๋ฉ๋‹ˆ๋‹ค
        - ๋งค์šฐ ๊ธด ํ…์ŠคํŠธ์˜ ๊ฒฝ์šฐ ์ฒ˜๋ฆฌ ์‹œ๊ฐ„์ด ์˜ค๋ž˜ ๊ฑธ๋ฆด ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค
        """
    )

demo.launch()