Johnson-Lsx committed on
Commit a253dd6
1 Parent(s): a7372aa

Update model
README.md ADDED
---
tags:
- espnet
- audio
- audio-to-audio
language: en
datasets:
- dns_ins20
license: cc-by-4.0
---

## ESPnet2 ENH model

### `Johnson-Lsx/Shaoxiong_Lin_dns_ins20_enh_enh_train_enh_dccrn_raw`

This model was trained by Shaoxiong Lin using the dns_ins20 recipe in [espnet](https://github.com/espnet/espnet/).

### Demo: How to use in ESPnet2

```bash
cd espnet
git checkout 4538462eb7dc6a6b858adcbd3a526fb8173d6f73
pip install -e .
cd egs2/dns_ins20/enh1
./run.sh --skip_data_prep false --skip_train true --download_model Johnson-Lsx/Shaoxiong_Lin_dns_ins20_enh_enh_train_enh_dccrn_raw
```
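For quick experiments outside the recipe, the packed model can also be loaded directly in Python. The snippet below is a minimal, unofficial sketch assuming the `espnet_model_zoo` and `soundfile` packages are installed; the file paths are placeholders and keyword names may differ slightly between ESPnet versions.

```python
# Minimal sketch (not part of the recipe): download the packed model and enhance one recording.
# Assumes `espnet_model_zoo` and `soundfile` are installed; "noisy.wav" is a placeholder path.
import soundfile as sf
from espnet_model_zoo.downloader import ModelDownloader
from espnet2.bin.enh_inference import SeparateSpeech

d = ModelDownloader()
# Returns local paths for the entries listed in the package's meta.yaml
# (here: train_config -> config.yaml, model_file -> 48epoch.pth).
files = d.download_and_unpack(
    "Johnson-Lsx/Shaoxiong_Lin_dns_ins20_enh_enh_train_enh_dccrn_raw"
)

enh = SeparateSpeech(
    train_config=files["train_config"],
    model_file=files["model_file"],
    normalize_output_wav=True,
)

# The model was trained on 16 kHz data (see exp/enh_stats_16k), so feed a 16 kHz mono recording.
mixture, fs = sf.read("noisy.wav")
waves = enh(mixture[None, :], fs=fs)        # list of enhanced waveforms (one for this model)
sf.write("enhanced.wav", waves[0].squeeze(), fs)
```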
<!-- Generated by ./scripts/utils/show_enh_score.sh -->
# RESULTS
## Environments
- date: `Thu Feb 10 23:11:40 CST 2022`
- python version: `3.8.12 (default, Oct 12 2021, 13:49:34) [GCC 7.5.0]`
- espnet version: `espnet 0.10.5a1`
- pytorch version: `pytorch 1.9.1`
- Git hash: `6f66283b9eed7b0d5e5643feb18d8f60118a4afc`
- Commit date: `Mon Dec 13 15:30:29 2021 +0800`

## enh_train_enh_dccrn_batch_size_raw

config: ./conf/tuning/train_enh_dccrn_batch_size.yaml

|dataset|STOI|SAR|SDR|SIR|
|---|---|---|---|---|
|enhanced_cv_synthetic|0.98|24.69|24.69|0.00|
|enhanced_tt_synthetic_no_reverb|0.96|17.69|17.69|0.00|
|enhanced_tt_synthetic_with_reverb|0.81|10.45|10.45|0.00|
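The table above is produced by the recipe's scoring stage (`./scripts/utils/show_enh_score.sh`, as noted in the generated comment). Purely as an illustration of what the columns measure, the hypothetical snippet below scores a single clean/enhanced pair with the third-party `pystoi` and `mir_eval` packages; it is not the script that generated these numbers.

```python
# Illustrative only: score one (clean reference, enhanced output) pair.
# Assumes `pystoi`, `mir_eval`, and `soundfile` are installed; paths are placeholders.
import numpy as np
import soundfile as sf
from pystoi import stoi
from mir_eval.separation import bss_eval_sources

ref, fs = sf.read("clean.wav")       # clean reference
est, _ = sf.read("enhanced.wav")     # enhanced signal, same length and sample rate

# STOI: intelligibility score roughly in [0, 1] (the STOI column above).
stoi_score = stoi(ref, est, fs, extended=False)

# BSS-eval metrics; with a single target source there is no interfering source,
# which is why the SIR column above is not informative.
sdr, sir, sar, _ = bss_eval_sources(ref[np.newaxis, :], est[np.newaxis, :])

print(f"STOI={stoi_score:.2f}  SDR={sdr[0]:.2f} dB  SAR={sar[0]:.2f} dB")
```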
## ENH config

<details><summary>expand</summary>

```
config: ./conf/tuning/train_enh_dccrn_batch_size.yaml
print_config: false
log_level: INFO
dry_run: false
iterator_type: chunk
output_dir: exp/enh_train_enh_dccrn_batch_size_raw
ngpu: 1
seed: 0
num_workers: 4
num_att_plot: 3
dist_backend: nccl
dist_init_method: env://
dist_world_size: 4
dist_rank: 0
local_rank: 0
dist_master_addr: localhost
dist_master_port: 46366
dist_launcher: null
multiprocessing_distributed: true
unused_parameters: false
sharded_ddp: false
cudnn_enabled: true
cudnn_benchmark: false
cudnn_deterministic: true
collect_stats: false
write_collected_feats: false
max_epoch: 100
patience: 10
val_scheduler_criterion:
- valid
- loss
early_stopping_criterion:
- valid
- loss
- min
best_model_criterion:
- - valid
  - si_snr
  - max
- - valid
  - loss
  - min
keep_nbest_models: 1
nbest_averaging_interval: 0
grad_clip: 5.0
grad_clip_type: 2.0
grad_noise: false
accum_grad: 1
no_forward_run: false
resume: true
train_dtype: float32
use_amp: false
log_interval: null
use_tensorboard: true
use_wandb: false
wandb_project: null
wandb_id: null
wandb_entity: null
wandb_name: null
wandb_model_log_interval: -1
detect_anomaly: false
pretrain_path: null
init_param: []
ignore_init_mismatch: false
freeze_param: []
num_iters_per_epoch: null
batch_size: 32
valid_batch_size: null
batch_bins: 1000000
valid_batch_bins: null
train_shape_file:
- exp/enh_stats_16k/train/speech_mix_shape
- exp/enh_stats_16k/train/speech_ref1_shape
- exp/enh_stats_16k/train/noise_ref1_shape
valid_shape_file:
- exp/enh_stats_16k/valid/speech_mix_shape
- exp/enh_stats_16k/valid/speech_ref1_shape
- exp/enh_stats_16k/valid/noise_ref1_shape
batch_type: folded
valid_batch_type: null
fold_length:
- 80000
- 80000
- 80000
sort_in_batch: descending
sort_batch: descending
multiple_iterator: false
chunk_length: 64000
chunk_shift_ratio: 0.5
num_cache_chunks: 1024
train_data_path_and_name_and_type:
- - dump/raw/tr_synthetic/wav.scp
  - speech_mix
  - sound
- - dump/raw/tr_synthetic/spk1.scp
  - speech_ref1
  - sound
- - dump/raw/tr_synthetic/noise1.scp
  - noise_ref1
  - sound
valid_data_path_and_name_and_type:
- - dump/raw/cv_synthetic/wav.scp
  - speech_mix
  - sound
- - dump/raw/cv_synthetic/spk1.scp
  - speech_ref1
  - sound
- - dump/raw/cv_synthetic/noise1.scp
  - noise_ref1
  - sound
allow_variable_data_keys: false
max_cache_size: 0.0
max_cache_fd: 32
valid_max_cache_size: null
optim: adam
optim_conf:
  lr: 0.001
  eps: 1.0e-08
  weight_decay: 1.0e-07
scheduler: reducelronplateau
scheduler_conf:
  mode: min
  factor: 0.7
  patience: 1
init: null
model_conf:
  loss_type: si_snr
criterions:
# The first criterion
- name: si_snr
  conf:
    eps: 1.0e-7
  # the wrapper for the current criterion
  # for the single-talker case, we simply use the fixed_order wrapper
  wrapper: fixed_order
  wrapper_conf:
    weight: 1.0
use_preprocessor: false
encoder: stft
encoder_conf:
  n_fft: 512
  win_length: 400
  hop_length: 100
separator: dccrn
separator_conf: {}
decoder: stft
decoder_conf:
  n_fft: 512
  win_length: 400
  hop_length: 100
required:
- output_dir
version: 0.10.5a1
distributed: true
```

</details>
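Training uses the `si_snr` criterion with a `fixed_order` wrapper, as listed in the config above. For readers unfamiliar with the objective, here is a small self-contained sketch of scale-invariant SNR in NumPy (the `eps` argument mirrors the `eps: 1.0e-7` entry for numerical stability); it illustrates the definition and is not ESPnet's internal implementation.

```python
import numpy as np

def si_snr(reference: np.ndarray, estimate: np.ndarray, eps: float = 1e-7) -> float:
    """Scale-invariant SNR in dB (illustrative re-implementation, not ESPnet's code)."""
    # Remove DC offset so the measure is invariant to a constant shift.
    reference = reference - reference.mean()
    estimate = estimate - estimate.mean()
    # Project the estimate onto the reference to isolate the "target" component.
    scale = np.dot(estimate, reference) / (np.dot(reference, reference) + eps)
    target = scale * reference
    noise = estimate - target
    return 10.0 * np.log10((np.sum(target ** 2) + eps) / (np.sum(noise ** 2) + eps))

# Example: a rescaled copy of the reference scores very high, unrelated noise scores very low.
rng = np.random.default_rng(0)
ref = rng.standard_normal(16000)
print(si_snr(ref, 0.5 * ref))                     # large positive value
print(si_snr(ref, rng.standard_normal(16000)))    # strongly negative value
```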

### Citing ESPnet

```bibtex
@inproceedings{watanabe2018espnet,
  author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
  title={{ESPnet}: End-to-End Speech Processing Toolkit},
  year={2018},
  booktitle={Proceedings of Interspeech},
  pages={2207--2211},
  doi={10.21437/Interspeech.2018-1456},
  url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
}

@inproceedings{ESPnet-SE,
  author    = {Chenda Li and Jing Shi and Wangyou Zhang and Aswin Shanmugam Subramanian and Xuankai Chang and
               Naoyuki Kamo and Moto Hira and Tomoki Hayashi and Christoph B{\"{o}}ddeker and Zhuo Chen and Shinji Watanabe},
  title     = {ESPnet-SE: End-To-End Speech Enhancement and Separation Toolkit Designed for {ASR} Integration},
  booktitle = {{IEEE} Spoken Language Technology Workshop, {SLT} 2021, Shenzhen, China, January 19-22, 2021},
  pages     = {785--792},
  publisher = {{IEEE}},
  year      = {2021},
  url       = {https://doi.org/10.1109/SLT48900.2021.9383615},
  doi       = {10.1109/SLT48900.2021.9383615},
  timestamp = {Mon, 12 Apr 2021 17:08:59 +0200},
  biburl    = {https://dblp.org/rec/conf/slt/Li0ZSCKHHBC021.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
```

or arXiv:

```bibtex
@misc{watanabe2018espnet,
  title={ESPnet: End-to-End Speech Processing Toolkit},
  author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
  year={2018},
  eprint={1804.00015},
  archivePrefix={arXiv},
  primaryClass={cs.CL}
}
```
exp/enh_stats_16k/train/feats_stats.npz ADDED
Binary file (778 Bytes).
 
exp/enh_train_enh_dccrn_batch_size_raw/48epoch.pth ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:fbb0b8e7643ca5fd70afb293f74d26ebd0fcae2f36df40b649f622371636a17c
size 14742669
exp/enh_train_enh_dccrn_batch_size_raw/RESULTS.md ADDED
exp/enh_train_enh_dccrn_batch_size_raw/config.yaml ADDED
exp/enh_train_enh_dccrn_batch_size_raw/images/backward_time.png ADDED
exp/enh_train_enh_dccrn_batch_size_raw/images/forward_time.png ADDED
exp/enh_train_enh_dccrn_batch_size_raw/images/gpu_max_cached_mem_GB.png ADDED
exp/enh_train_enh_dccrn_batch_size_raw/images/iter_time.png ADDED
exp/enh_train_enh_dccrn_batch_size_raw/images/loss.png ADDED
exp/enh_train_enh_dccrn_batch_size_raw/images/optim0_lr0.png ADDED
exp/enh_train_enh_dccrn_batch_size_raw/images/optim_step_time.png ADDED
exp/enh_train_enh_dccrn_batch_size_raw/images/si_snr.png ADDED
exp/enh_train_enh_dccrn_batch_size_raw/images/train_time.png ADDED
meta.yaml ADDED
espnet: 0.10.7a1
files:
  model_file: exp/enh_train_enh_dccrn_batch_size_raw/48epoch.pth
python: "3.8.12 (default, Oct 12 2021, 13:49:34) \n[GCC 7.5.0]"
timestamp: 1646303990.064911
torch: 1.9.1
yaml_files:
  train_config: exp/enh_train_enh_dccrn_batch_size_raw/config.yaml