import gradio as gr
import json
import torch
import soundfile as sf
from transformers import AutoModel

# from utils.llm import get_time_info  # alternative LLM backend
from utils.llm_xiapi import get_time_info

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModel.from_pretrained("rookie9/PicoAudio2", trust_remote_code=True).to(device)


def is_tdc_format_valid(tdc_str):
    """Check that a TDC string follows 'event__start-end_start-end--event__...'."""
    try:
        for event_onset in tdc_str.split('--'):
            event, instance = event_onset.split('__')
            for start_end in instance.split('_'):
                start, end = start_end.split('-')
                float(start), float(end)  # timestamps must be numeric
        return True
    except Exception:
        return False


def a_to_b(a_str):
    """Convert the user-facing TDC format 'event(s1-e1, s2-e2); ...' to the
    model's internal format 'event__s1-e1_s2-e2--...'."""
    items = a_str.split(';')
    result = []
    for item in items:
        item = item.strip()
        if not item:
            continue
        if '(' in item and ')' in item:
            name, times = item.split('(', 1)
            name = name.strip().replace(' ', '_')
            times = times.strip(')').replace(', ', '_').replace(',', '_')
            result.append(f"{name}__{times}")
    return '--'.join(result)


def b_to_a(b_str):
    """Inverse of a_to_b: convert the internal TDC format back to
    'event(s1-e1, s2-e2); ...'."""
    events = b_str.split('--')
    result = []
    for e in events:
        if '__' not in e:
            continue
        name, times = e.split('__', 1)
        name = name.replace('_', ' ')
        times = times.replace('_', ', ')
        result.append(f"{name}({times})")
    return '; '.join(result)


def convert_tdc_to_tcc(b_str):
    """Derive a coarse caption (TCC) from a TDC string by joining its event names."""
    events = b_str.split('--')
    names = []
    for e in events:
        if '__' not in e:
            continue
        name, _ = e.split('__', 1)
        names.append(name.replace('_', ' '))
    return ' and '.join(names)


def infer(input_text, input_onset, input_length, time_control):
    # Normalize inputs: convert the user-facing TDC format to the internal one,
    # but leave the literal keyword "random" untouched (a_to_b would discard it).
    if input_onset and input_onset.strip() != "random":
        input_onset = a_to_b(input_onset)

    # Derive a TCC from the TDC when only the latter is given.
    if not input_text and input_onset and is_tdc_format_valid(input_onset):
        input_text = convert_tdc_to_tcc(input_onset)
    elif not input_text:
        input_text = "a dog barks"

    # Fall back to "random" when the TDC cannot be parsed.
    if input_onset and not is_tdc_format_valid(input_onset):
        input_onset = "random"

    if time_control:
        # Ask the LLM for timestamps when the TDC or length is missing.
        if not input_onset or not input_length:
            input_json = json.loads(get_time_info(input_text))
            input_onset, input_length = input_json["onset"], input_json["length"]
    else:
        input_onset = input_onset if input_onset else "random"
        input_length = input_length if input_length else "10.0"

    content = {
        "caption": input_text,
        "onset": input_onset,
        "length": input_length,
    }

    with torch.no_grad():
        waveform = model(content)

    output_wav = "output.wav"
    sf.write(
        output_wav,
        waveform[0, 0].cpu().numpy(),
        samplerate=24000,
    )
    # input_onset = b_to_a(input_onset)  # uncomment to return the user-facing format
    return output_wav, str(input_onset)


demo = gr.Interface(
    fn=infer,
    inputs=[
        gr.Textbox(label="TCC (necessary)", value="a dog barks"),
        gr.Textbox(label="TDC (optional, see format)", value="a dog barks(3.0-4.0, 6.0-7.0)"),
        gr.Textbox(label="Length (seconds, optional)", value="10.0"),
        gr.Checkbox(label="Enable Time Control", value=False),
    ],
    outputs=[
        gr.Audio(label="Generated Audio"),
        gr.Textbox(label="Final TDC Used (input_onset)"),
    ],
    title="PicoAudio2 Online Inference",
    description="""
## Definition

**TCC (Temporal Coarse Caption):** A brief text description of the overall audio scene.
*Example*: `a dog barks`

**TDC (Temporal Detailed Caption):** A **caption with timestamp information** for each event. It allows precise temporal control over when events happen in the generated audio.
*Example*: `a dog barks(1.0-2.0, 3.0-4.0); a man speaks(5.0-6.0)`

---

## Input Requirements & Format

- **TCC** is **required** for audio generation.
- **TDC** is **optional**. If provided, it should follow the format:
  `event1(start1-end1, start2-end2); event2(start1-end1, ...)`
- **Length** (in seconds) is optional but recommended for temporal control. It defaults to 10.0 seconds.
- **Enable Time Control**: Tick to use the TDC and length for precise event timing.

---

## Notes

- If the TDC format is incorrect or the length is missing, the model generates audio **without precise temporal control**.
- For general audio generation, entering `random` as the TDC is recommended.
- You may leave the TDC blank to let the LLM generate timestamps automatically (subject to API quota).

---
""",
)

if __name__ == "__main__":
    demo.launch()
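
# A minimal sketch of the TDC round-trip performed by a_to_b / b_to_a above.
# Shown as doctest-style comments for illustration only; nothing here runs when
# the app starts (demo.launch() blocks until the server stops):
#
#   >>> a_to_b("a dog barks(1.0-2.0, 3.0-4.0); a man speaks(5.0-6.0)")
#   'a_dog_barks__1.0-2.0_3.0-4.0--a_man_speaks__5.0-6.0'
#   >>> b_to_a("a_dog_barks__1.0-2.0_3.0-4.0--a_man_speaks__5.0-6.0")
#   'a dog barks(1.0-2.0, 3.0-4.0); a man speaks(5.0-6.0)'
#   >>> convert_tdc_to_tcc("a_dog_barks__1.0-2.0--a_man_speaks__5.0-6.0")
#   'a dog barks and a man speaks'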