# UniFlow-Audio-small / config.yaml
# Source: wsntxxn's Hugging Face repository
# ("Upload folder using huggingface_hub", commit ba849c2, verified)
---
# NOTE(review): in the copied file every line was flushed to column 0, which
# destroyed the nesting and created duplicate mapping keys (invalid YAML 1.2,
# e.g. `sample_rate`, `latent_dim`, `_target_` repeated at one level). The
# hierarchy below is reconstructed from the Hydra `_target_` class paths and
# the original key order — confirm against the upstream config before use.
sample_rate: 24000

model:
  # Top-level flow-matching wrapper (Hydra instantiation target).
  _target_: models.flow_matching.DummyContentAudioFlowMatching
  content_dim: 1024
  frame_resolution: 0.005
  duration_offset: 1.0
  # Probability of dropping conditioning for classifier-free guidance.
  cfg_drop_ratio: 0.2

  autoencoder:
    _target_: models.autoencoder.waveform.stable_vae.StableVAE
    encoder:
      _target_: models.autoencoder.waveform.stable_vae.OobleckEncoder
      in_channels: 1
      channels: 128
      c_mults:
        - 1
        - 2
        - 4
        - 8
      strides:
        - 2
        - 4
        - 6
        - 10
      latent_dim: 256
      use_snake: true
    decoder:
      _target_: models.autoencoder.waveform.stable_vae.OobleckDecoder
      out_channels: 1
      channels: 128
      c_mults:
        - 1
        - 2
        - 4
        - 8
      strides:
        - 2
        - 4
        - 6
        - 10
      latent_dim: 128
      use_snake: true
      final_tanh: false
    io_channels: 1
    latent_dim: 128
    # Matches the product of the encoder strides (2 * 4 * 6 * 10 = 480).
    downsampling_ratio: 480
    sample_rate: 24000
    pretrained_ckpt: vae/speech_audio_sound_step=1000000.ckpt
    bottleneck:
      _target_: models.autoencoder.waveform.stable_vae.VAEBottleneck

  backbone:
    _target_: models.dit.audio_dit.LayerFusionAudioDiT
    img_size: 1000
    patch_size: 1
    # in/out channel width matches the autoencoder latent_dim (128).
    in_chans: 128
    out_chans: 128
    input_type: 1d
    embed_dim: 512
    depth: 12
    num_heads: 8
    mlp_ratio: 4.0
    qkv_bias: false
    qk_scale: null
    qk_norm: layernorm
    norm_layer: layernorm
    act_layer: geglu
    context_norm: true
    use_checkpoint: false
    time_fusion: ada
    ada_sola_rank: 32
    ada_sola_alpha: 32
    cls_dim: null
    ta_context_dim: 1024
    ta_context_fusion: add
    ta_context_norm: true
    context_dim: 1024
    context_fusion: cross
    context_max_length: null
    context_pe_method: none
    pe_method: none
    rope_mode: shared
    use_conv: true
    skip: true
    skip_norm: true

  content_adapter:
    _target_: models.content_adapter.CrossAttentionAdapter
    content_dim: 1024
    d_out: 1024
    prefix_dim: 1024
    num_heads: 16
    dropout: 0.2
    duration_grad_scale: 0.1
    duration_predictor:
      _target_: models.content_adapter.DurationPredictor
      in_channels: 1024
      filter_channels: 512
      n_layers: 5
      kernel_size: 3
      p_dropout: 0.5

  content_encoder:
    _target_: models.content_encoder.content_encoder.ContentEncoder
    embed_dim: 1024
    text_encoder:
      _target_: models.content_encoder.text_encoder.T5TextEncoder
      model_name: google/flan-t5-large
      embed_dim: 1024
    midi_encoder:
      _target_: models.content_encoder.midi_encoder.FastSpeech2MIDIEncoder
      phone_vocab_size: 61
      midi_vocab_size: 300
      slur_vocab_size: 2
      spk_config:
        _target_: models.content_encoder.midi_encoder.SpkConfig
        # Speakers identified by integer id (cf. phoneme_encoder, which
        # uses a precomputed speaker embedding instead).
        encoding_format: id
        num_spk: 20
      d_model: 512
      num_layers: 4
      num_heads: 2
      ffn_kernel_size: 9
      d_out: 1024
    audio_encoder:
      _target_: models.autoencoder.waveform.stable_vae.StableVAEProjectorWrapper
      vae_dim: 128
      embed_dim: 1024
    video_encoder:
      _target_: models.content_encoder.vision_encoder.MlpVideoEncoder
      video_feat_dim: 1024
      embed_dim: 1024
    phoneme_encoder:
      _target_: models.content_encoder.midi_encoder.FastSpeech2PhonemeEncoder
      phone_vocab_size: 92
      d_model: 512
      num_layers: 4
      num_heads: 2
      ffn_kernel_size: 9
      d_out: 1024
      spk_config:
        _target_: models.content_encoder.midi_encoder.SpkConfig
        encoding_format: embedding
        spk_embed_dim: 256