# Python dependencies, one requirement specifier per line (pip requirements format).
# Duplicate entries have been merged: `gradio` keeps its versioned specifier and
# `soundfile` is listed once.

# Core audio / model stack
torch
torchaudio
soundfile
transformers
bitsandbytes>0.37.0
vinorm
cached_path
huggingface_hub

# Remaining dependencies (alphabetical)
accelerate>=0.33.0
click
datasets
ema_pytorch>=0.5.2
gradio>=3.45.2
hydra-core>=1.3.0
jieba
librosa
matplotlib
numpy<=1.26.4
pydub
pypinyin
safetensors
tomli
torchdiffeq
tqdm>=4.65.0
transformers_stream_generator
vocos
wandb
x_transformers>=1.31.14