File size: 1,635 Bytes
45e1a77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
{
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": true,
  "sampling_rate": 16000,
  "preprocessing": {
    "audio_normalization": {
      "method": "peak",
      "target_level": -23.0,
      "headroom_db": 3.0
    },
    "spectral_features": {
      "mel_filters": 128,
      "window_size_ms": 25,
      "stride_ms": 10,
      "fmin": 50,
      "fmax": 8000,
      "htk_compat": true
    },
    "augmentation": {
      "time_masking": {
        "enabled": true,
        "time_mask_param": 100,
        "num_masks": 2
      },
      "freq_masking": {
        "enabled": true,
        "freq_mask_param": 27,
        "num_masks": 2
      },
      "noise": {
        "enabled": true,
        "noise_types": ["gaussian", "pink"],
        "snr_range": [5, 20]
      }
    },
    "signal_enhancement": {
      "vad": {
        "enabled": true,
        "threshold": 0.5,
        "min_speech_duration_ms": 250
      },
      "noise_reduction": {
        "enabled": true,
        "method": "spectral_gating",
        "stationary_threshold": 1.5
      }
    }
  },
  "advanced_settings": {
    "feature_extraction": {
      "normalize_means": true,
      "normalize_vars": true,
      "deltas_order": 2,
      "cmvn_window": 300
    },
    "resampling": {
      "method": "kaiser_best",
      "lowpass_filter_width": 64,
      "rolloff": 0.945,
      "beta": 14.0
    },
    "performance": {
      "num_workers": 4,
      "pin_memory": true,
      "prefetch_factor": 2,
      "persistent_workers": true
    }
  }
}