{ "architectures": [ "ToneForCTC" ], "ctc_loss_reduction": "mean", "ctc_zero_infinity": true, "decoder_params": { "feat_in": 384, "vocabulary": [ "а", "б", "в", "г", "д", "е", "ё", "ж", "з", "и", "й", "к", "л", "м", "н", "о", "п", "р", "с", "т", "у", "ф", "х", "ц", "ч", "ш", "щ", "ъ", "ы", "ь", "э", "ю", "я", " " ] }, "encoder_params": { "chunk_size": 10, "conv_kernel_size": 31, "d_model": 384, "dropout": 0.1, "dropout_att": 0.1, "feat_in": 64, "ff_expansion_factor": 4, "mhsa_state_size": 30, "mhsa_stateless_layers": 14, "n_heads": 8, "n_layers": 16, "reduction_factor": 2, "reduction_kernel_size": 3, "reduction_position": 6, "rope_dim": 32, "should_recompute_att_scores": [ true, false, false, false, false, false, false, true, false, false, false, false, false, false, true, true ], "subsampling_conv_channels": [ 32, 64 ], "subsampling_kernel_size": [ [ 11, 21 ], [ 11, 11 ] ], "subsampling_strides": [ [ 1, 1 ], [ 3, 1 ] ], "upsample_position": 14 }, "feature_extraction_params": { "n_fft": 160, "n_mels": 64, "preemphasis_coefficient": 0.97, "sample_rate": 8000, "window_size": 0.02, "window_stride": 0.01 }, "pad_token_id": 34, "torch_dtype": "float32", "transformers_version": "4.41.2" }