{
  "model": {
    "fm_decoder_downsampling_factor": [1, 2, 4, 2, 1],
    "fm_decoder_num_layers": [2, 2, 4, 4, 4],
    "fm_decoder_cnn_module_kernel": [31, 15, 7, 15, 31],
    "fm_decoder_feedforward_dim": 1536,
    "fm_decoder_num_heads": 4,
    "fm_decoder_dim": 512,
    "text_encoder_num_layers": 4,
    "text_encoder_feedforward_dim": 512,
    "text_encoder_cnn_module_kernel": 9,
    "text_encoder_num_heads": 4,
    "text_encoder_dim": 192,
    "query_head_dim": 32,
    "value_head_dim": 12,
    "pos_head_dim": 4,
    "pos_dim": 48,
    "time_embed_dim": 192,
    "text_embed_dim": 192,
    "feat_dim": 100
  },
  "feature": {
    "sampling_rate": 24000,
    "type": "vocos"
  }
}