| sample_rate: 24000 | |
| model: | |
| autoencoder: | |
| _target_: models.autoencoder.waveform.stable_vae.StableVAE | |
| encoder: | |
| _target_: models.autoencoder.waveform.stable_vae.OobleckEncoder | |
| in_channels: 1 | |
| channels: 128 | |
| c_mults: | |
| - 1 | |
| - 2 | |
| - 4 | |
| - 8 | |
| strides: | |
| - 2 | |
| - 4 | |
| - 6 | |
| - 10 | |
| latent_dim: 256 | |
| use_snake: true | |
| decoder: | |
| _target_: models.autoencoder.waveform.stable_vae.OobleckDecoder | |
| out_channels: 1 | |
| channels: 128 | |
| c_mults: | |
| - 1 | |
| - 2 | |
| - 4 | |
| - 8 | |
| strides: | |
| - 2 | |
| - 4 | |
| - 6 | |
| - 10 | |
| latent_dim: 128 | |
| use_snake: true | |
| final_tanh: false | |
| io_channels: 1 | |
| latent_dim: 128 | |
| downsampling_ratio: 480 | |
| sample_rate: 24000 | |
| pretrained_ckpt: vae/speech_audio_sound_step=1000000.ckpt | |
| bottleneck: | |
| _target_: models.autoencoder.waveform.stable_vae.VAEBottleneck | |
| backbone: | |
| _target_: models.dit.audio_dit.LayerFusionAudioDiT | |
| img_size: 1000 | |
| patch_size: 1 | |
| in_chans: 128 | |
| out_chans: 128 | |
| input_type: 1d | |
| embed_dim: 512 | |
| depth: 12 | |
| num_heads: 8 | |
| mlp_ratio: 4.0 | |
| qkv_bias: false | |
| qk_scale: null | |
| qk_norm: layernorm | |
| norm_layer: layernorm | |
| act_layer: geglu | |
| context_norm: true | |
| use_checkpoint: false | |
| time_fusion: ada | |
| ada_sola_rank: 32 | |
| ada_sola_alpha: 32 | |
| cls_dim: null | |
| ta_context_dim: 1024 | |
| ta_context_fusion: add | |
| ta_context_norm: true | |
| context_dim: 1024 | |
| context_fusion: cross | |
| context_max_length: null | |
| context_pe_method: none | |
| pe_method: none | |
| rope_mode: shared | |
| use_conv: true | |
| skip: true | |
| skip_norm: true | |
| content_adapter: | |
| _target_: models.content_adapter.CrossAttentionAdapter | |
| content_dim: 1024 | |
| d_out: 1024 | |
| prefix_dim: 1024 | |
| num_heads: 16 | |
| dropout: 0.2 | |
| duration_grad_scale: 0.1 | |
| duration_predictor: | |
| _target_: models.content_adapter.DurationPredictor | |
| in_channels: 1024 | |
| filter_channels: 512 | |
| n_layers: 5 | |
| kernel_size: 3 | |
| p_dropout: 0.5 | |
| content_dim: 1024 | |
| frame_resolution: 0.005 | |
| duration_offset: 1.0 | |
| cfg_drop_ratio: 0.2 | |
| _target_: models.flow_matching.DummyContentAudioFlowMatching | |
| content_encoder: | |
| _target_: models.content_encoder.content_encoder.ContentEncoder | |
| embed_dim: 1024 | |
| text_encoder: | |
| _target_: models.content_encoder.text_encoder.T5TextEncoder | |
| model_name: google/flan-t5-large | |
| embed_dim: 1024 | |
| midi_encoder: | |
| _target_: models.content_encoder.midi_encoder.FastSpeech2MIDIEncoder | |
| phone_vocab_size: 61 | |
| midi_vocab_size: 300 | |
| slur_vocab_size: 2 | |
| spk_config: | |
| _target_: models.content_encoder.midi_encoder.SpkConfig | |
| encoding_format: id | |
| num_spk: 20 | |
| d_model: 512 | |
| num_layers: 4 | |
| num_heads: 2 | |
| ffn_kernel_size: 9 | |
| d_out: 1024 | |
| audio_encoder: | |
| _target_: models.autoencoder.waveform.stable_vae.StableVAEProjectorWrapper | |
| vae_dim: 128 | |
| embed_dim: 1024 | |
| video_encoder: | |
| _target_: models.content_encoder.vision_encoder.MlpVideoEncoder | |
| video_feat_dim: 1024 | |
| embed_dim: 1024 | |
| phoneme_encoder: | |
| _target_: models.content_encoder.midi_encoder.FastSpeech2PhonemeEncoder | |
| phone_vocab_size: 92 | |
| d_model: 512 | |
| num_layers: 4 | |
| num_heads: 2 | |
| ffn_kernel_size: 9 | |
| d_out: 1024 | |
| spk_config: | |
| _target_: models.content_encoder.midi_encoder.SpkConfig | |
| encoding_format: embedding | |
| spk_embed_dim: 256 | |