import gradio as gr
import json
import torch
import soundfile as sf
from transformers import AutoModel

# from utils.llm import get_time_info  # alternative LLM backend
from utils.llm_xiapi import get_time_info

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModel.from_pretrained("rookie9/PicoAudio2", trust_remote_code=True).to(device)


def is_tdc_format_valid(tdc_str):
    """Check that a TDC string follows 'event__start-end_start-end--event__...'."""
    try:
        for event_onset in tdc_str.split('--'):
            event, instance = event_onset.split('__')
            for start_end in instance.split('_'):
                start, end = start_end.split('-')
                float(start), float(end)  # timestamps must be numeric
        return True
    except Exception:
        return False


def a_to_b(a_str):
    """Convert the user-facing TDC format 'event(s1-e1, s2-e2); ...' to the
    model's internal format 'event__s1-e1_s2-e2--...'."""
    items = a_str.split(';')
    result = []
    for item in items:
        item = item.strip()
        if not item:
            continue
        if '(' in item and ')' in item:
            name, times = item.split('(', 1)
            name = name.strip().replace(' ', '_')
            times = times.strip(')').replace(', ', '_').replace(',', '_')
            result.append(f"{name}__{times}")
    return '--'.join(result)


def b_to_a(b_str):
    """Inverse of a_to_b: convert the internal TDC format back to
    'event(s1-e1, s2-e2); ...'."""
    events = b_str.split('--')
    result = []
    for e in events:
        if '__' not in e:
            continue
        name, times = e.split('__', 1)
        name = name.replace('_', ' ')
        times = times.replace('_', ', ')
        result.append(f"{name}({times})")
    return '; '.join(result)


def convert_tdc_to_tcc(b_str):
    """Derive a coarse caption (TCC) from a TDC string by joining its event names."""
    events = b_str.split('--')
    names = []
    for e in events:
        if '__' not in e:
            continue
        name, _ = e.split('__', 1)
        names.append(name.replace('_', ' '))
    return ' and '.join(names)


def infer(input_text, input_onset, input_length, time_control):
    # Normalize inputs: convert the user-facing TDC format to the internal one,
    # but leave the literal keyword "random" untouched (a_to_b would discard it).
    if input_onset and input_onset.strip() != "random":
        input_onset = a_to_b(input_onset)

    # Derive a TCC from the TDC when only the latter is given.
    if not input_text and input_onset and is_tdc_format_valid(input_onset):
        input_text = convert_tdc_to_tcc(input_onset)
    elif not input_text:
        input_text = "a dog barks"

    # Fall back to "random" when the TDC cannot be parsed.
    if input_onset and not is_tdc_format_valid(input_onset):
        input_onset = "random"

    if time_control:
        # Ask the LLM for timestamps when the TDC or length is missing.
        if not input_onset or not input_length:
            input_json = json.loads(get_time_info(input_text))
            input_onset, input_length = input_json["onset"], input_json["length"]
    else:
        input_onset = input_onset if input_onset else "random"
        input_length = input_length if input_length else "10.0"

    content = {
        "caption": input_text,
        "onset": input_onset,
        "length": input_length,
    }

    with torch.no_grad():
        waveform = model(content)

    output_wav = "output.wav"
    sf.write(
        output_wav,
        waveform[0, 0].cpu().numpy(),
        samplerate=24000,
    )
    # input_onset = b_to_a(input_onset)  # uncomment to return the user-facing format
    return output_wav, str(input_onset)


demo = gr.Interface(
    fn=infer,
    inputs=[
        gr.Textbox(label="TCC (necessary)", value="a dog barks"),
        gr.Textbox(label="TDC (optional, see format)", value="a dog barks(3.0-4.0, 6.0-7.0)"),
        gr.Textbox(label="Length (seconds, optional)", value="10.0"),
        gr.Checkbox(label="Enable Time Control", value=False),
    ],
    outputs=[
        gr.Audio(label="Generated Audio"),
        gr.Textbox(label="Final TDC Used (input_onset)"),
    ],
    title="PicoAudio2 Online Inference",
    description="""
## Definition

**TCC (Temporal Coarse Caption):** A brief text description of the overall audio scene.
*Example*: `a dog barks`

**TDC (Temporal Detailed Caption):** A **caption with timestamp information** for each event. It allows precise temporal control over when events happen in the generated audio.
*Example*: `a dog barks(1.0-2.0, 3.0-4.0); a man speaks(5.0-6.0)`

---

## Input Requirements & Format

- **TCC** is **required** for audio generation.
- **TDC** is **optional**. If provided, it should follow the format:
  `event1(start1-end1, start2-end2); event2(start1-end1, ...)`
- **Length** (in seconds) is optional but recommended for temporal control. It defaults to 10.0 seconds.
- **Enable Time Control**: Tick to use the TDC and length for precise event timing.

---

## Notes

- If the TDC format is incorrect or the length is missing, the model generates audio **without precise temporal control**.
- For general audio generation, entering `random` as the TDC is recommended.
- You may leave the TDC blank to let the LLM generate timestamps automatically (subject to API quota).

---
""",
)

if __name__ == "__main__":
    demo.launch()
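
# A minimal sketch of the TDC round-trip performed by a_to_b / b_to_a above.
# Shown as doctest-style comments for illustration only; nothing here runs when
# the app starts (demo.launch() blocks until the server stops):
#
#   >>> a_to_b("a dog barks(1.0-2.0, 3.0-4.0); a man speaks(5.0-6.0)")
#   'a_dog_barks__1.0-2.0_3.0-4.0--a_man_speaks__5.0-6.0'
#   >>> b_to_a("a_dog_barks__1.0-2.0_3.0-4.0--a_man_speaks__5.0-6.0")
#   'a dog barks(1.0-2.0, 3.0-4.0); a man speaks(5.0-6.0)'
#   >>> convert_tdc_to_tcc("a_dog_barks__1.0-2.0--a_man_speaks__5.0-6.0")
#   'a dog barks and a man speaks'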