{ "model" : { "fm_decoder_downsampling_factor" : [1,2,4,2,1], "fm_decoder_num_layers" : [2,2,4,4,4], "fm_decoder_cnn_module_kernel" : [31,15,7,15,31], "fm_decoder_feedforward_dim" : 1536, "fm_decoder_num_heads" : 4, "fm_decoder_dim" : 512, "text_encoder_num_layers" : 4, "text_encoder_feedforward_dim" : 512, "text_encoder_cnn_module_kernel" : 9, "text_encoder_num_heads" : 4, "text_encoder_dim" : 192, "query_head_dim" : 32, "value_head_dim" : 12, "pos_head_dim" : 4, "pos_dim" : 48, "time_embed_dim" : 192, "text_embed_dim" : 192, "feat_dim": 100 }, "feature" : { "sampling_rate": 24000, "type": "vocos" } }