T-one / config.json
sxdxfan's picture
Add model card, acoustic model checkpoint in safetensors format, ONNX model, tokenizer configs and KenLM model
e23fc14
{
"architectures": [
"ToneForCTC"
],
"ctc_loss_reduction": "mean",
"ctc_zero_infinity": true,
"decoder_params": {
"feat_in": 384,
"vocabulary": [
"а",
"б",
"в",
"г",
"д",
"е",
"ё",
"ж",
"з",
"и",
"й",
"к",
"л",
"м",
"н",
"о",
"п",
"р",
"с",
"т",
"у",
"ф",
"х",
"ц",
"ч",
"ш",
"щ",
"ъ",
"ы",
"ь",
"э",
"ю",
"я",
" "
]
},
"encoder_params": {
"chunk_size": 10,
"conv_kernel_size": 31,
"d_model": 384,
"dropout": 0.1,
"dropout_att": 0.1,
"feat_in": 64,
"ff_expansion_factor": 4,
"mhsa_state_size": 30,
"mhsa_stateless_layers": 14,
"n_heads": 8,
"n_layers": 16,
"reduction_factor": 2,
"reduction_kernel_size": 3,
"reduction_position": 6,
"rope_dim": 32,
"should_recompute_att_scores": [
true,
false,
false,
false,
false,
false,
false,
true,
false,
false,
false,
false,
false,
false,
true,
true
],
"subsampling_conv_channels": [
32,
64
],
"subsampling_kernel_size": [
[
11,
21
],
[
11,
11
]
],
"subsampling_strides": [
[
1,
1
],
[
3,
1
]
],
"upsample_position": 14
},
"feature_extraction_params": {
"n_fft": 160,
"n_mels": 64,
"preemphasis_coefficient": 0.97,
"sample_rate": 8000,
"window_size": 0.02,
"window_stride": 0.01
},
"pad_token_id": 34,
"torch_dtype": "float32",
"transformers_version": "4.41.2"
}