










n_mels: 80





pretrained_path: poonehmousavi/discrete_wavlm_spk_rec_ecapatdn



out_n_neurons: 1211

save_folder: tmp









ssl_model_type: wavlm

ssl_hub: microsoft/wavlm-large

ssl_folder: !ref <save_folder>/ssl_checkpoint

kmeans_repo_id: speechbrain/SSL_Quantization

kmeans_cache_dir: !ref <save_folder>/kmeans_checkpoint

kmeans_dataset: LibriSpeech-100-360-500

freeze_ssl: True

freeze_feature_extractor: True

num_clusters: 1000













ssl_layer_num: [1, 3, 7, 12, 18, 23]

num_codebooks: 6

deduplicate: [False, False, False, False, False, False]

bpe_tokenizer_path: [null, null, null, null, null, null]

sample_rate: 16000





encoder_dim: 1024



tokenizer_config:

    SSL_layers: !ref <ssl_layer_num>

    deduplicates: !ref <deduplicate>

    bpe_tokenizers: !ref <bpe_tokenizer_path>

|
ssl_model: !apply:speechbrain.utils.hparams.choice

    value: !ref <ssl_model_type>

    choices:

        wavlm: !new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM

            source: !ref <ssl_hub>

            output_norm: False

            freeze: !ref <freeze_ssl>

            freeze_feature_extractor: !ref <freeze_feature_extractor>

            output_all_hiddens: True

            save_path: !ref <ssl_folder>

        hubert: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT

            source: !ref <ssl_hub>

            output_norm: False

            freeze: !ref <freeze_ssl>

            freeze_feature_extractor: !ref <freeze_feature_extractor>

            output_all_hiddens: True

            save_path: !ref <ssl_folder>

        wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2

            source: !ref <ssl_hub>

            output_norm: False

            freeze: !ref <freeze_ssl>

            freeze_feature_extractor: !ref <freeze_feature_extractor>

            output_all_hiddens: True

            save_path: !ref <ssl_folder>



codec: !new:speechbrain.lobes.models.huggingface_transformers.discrete_ssl.DiscreteSSL

    save_path: !ref <kmeans_cache_dir>

    ssl_model: !ref <ssl_model>

    kmeans_dataset: !ref <kmeans_dataset>

    kmeans_repo_id: !ref <kmeans_repo_id>

    num_clusters: !ref <num_clusters>

|
discrete_embedding_layer: !new:custom_interface.Discrete_EmbeddingLayer

    num_codebooks: !ref <num_codebooks>

    vocab_size: !ref <num_clusters>

    emb_dim: !ref <encoder_dim>



attention_mlp: !new:custom_interface.AttentionMLP

    input_dim: !ref <encoder_dim>

    hidden_dim: !ref <encoder_dim>



embedding_model: !new:speechbrain.lobes.models.ECAPA_TDNN.ECAPA_TDNN

    input_size: !ref <encoder_dim>

    channels: [1024, 1024, 1024, 1024, 3072]

    kernel_sizes: [5, 3, 3, 3, 1]

    dilations: [1, 2, 3, 4, 1]

    groups: [1, 1, 1, 1, 1]

    attention_channels: 128

    lin_neurons: 192



classifier: !new:speechbrain.lobes.models.ECAPA_TDNN.Classifier

    input_size: 192

    out_neurons: !ref <out_n_neurons>
|
|
|
|
|
|
|
modules:

    embedding_model: !ref <embedding_model>

    classifier: !ref <classifier>

    attention_mlp: !ref <attention_mlp>

    codec: !ref <codec>

    discrete_embedding_layer: !ref <discrete_embedding_layer>




label_encoder: !new:speechbrain.dataio.encoder.CategoricalEncoder




pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer

    loadables:

        embedding_model: !ref <embedding_model>

        classifier: !ref <classifier>

        attention_mlp: !ref <attention_mlp>

        discrete_embedding_layer: !ref <discrete_embedding_layer>

        label_encoder: !ref <label_encoder>

    paths:

        embedding_model: !ref <pretrained_path>/embedding_model.ckpt

        classifier: !ref <pretrained_path>/classifier.ckpt

        attention_mlp: !ref <pretrained_path>/attention_mlp.ckpt

        label_encoder: !ref <pretrained_path>/label_encoder.txt

        discrete_embedding_layer: !ref <pretrained_path>/discrete_embedding_layer.ckpt



