pyf98 committed on
Commit
4f60adb
·
1 Parent(s): eb7e648
Files changed (37) hide show
  1. README.md +30 -0
  2. data/token_list/bpe_unigram50000/bpe.model +3 -0
  3. exp/s2t_stats_raw_bpe50000/train/feats_stats.npz +3 -0
  4. exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/RESULTS.md +9 -0
  5. exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/config.yaml +0 -0
  6. exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/images/backward_time.png +0 -0
  7. exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/images/cer_ctc.png +0 -0
  8. exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/images/cer_interctc_layer12.png +0 -0
  9. exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/images/cer_interctc_layer15.png +0 -0
  10. exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/images/cer_interctc_layer21.png +0 -0
  11. exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/images/cer_interctc_layer6.png +0 -0
  12. exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/images/clip.png +0 -0
  13. exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/images/forward_time.png +0 -0
  14. exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/images/gpu_max_cached_mem_GB.png +0 -0
  15. exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/images/grad_norm.png +0 -0
  16. exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/images/iter_time.png +0 -0
  17. exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/images/loss.png +0 -0
  18. exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/images/loss_ctc.png +0 -0
  19. exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/images/loss_interctc_layer12.png +0 -0
  20. exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/images/loss_interctc_layer15.png +0 -0
  21. exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/images/loss_interctc_layer21.png +0 -0
  22. exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/images/loss_interctc_layer6.png +0 -0
  23. exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/images/loss_scale.png +0 -0
  24. exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/images/optim0_lr0.png +0 -0
  25. exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/images/optim_step_time.png +0 -0
  26. exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/images/train_time.png +0 -0
  27. exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/train.1.log +0 -0
  28. exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/train.2.log +0 -0
  29. exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/train.3.log +0 -0
  30. exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/train.4.log +0 -0
  31. exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/train.5.log +0 -0
  32. exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/train.6.log +0 -0
  33. exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/train.7.log +0 -0
  34. exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/train.8.log +0 -0
  35. exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/train.log +0 -0
  36. exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/valid.total_count.ave_5best.till70epoch.pth +3 -0
  37. meta.yaml +8 -0
README.md ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - automatic-speech-recognition
6
+ - speech-translation
7
+ - language-identification
8
+ language: multilingual
9
+ datasets:
10
+ - owsm_ctc_v4
11
+ license: cc-by-4.0
12
+ metrics:
13
+ - cer
14
+ - bleu
15
+ - accuracy
16
+ library_name: espnet
17
+ ---
18
+
19
+ [OWSM-CTC](https://aclanthology.org/2024.acl-long.549/) (Peng et al., ACL 2024) is an encoder-only speech foundation model based on hierarchical multi-task self-conditioned CTC.
20
+ OWSM-CTC v4 is trained on 320k hours of public audio data for multilingual speech recognition, any-to-any speech translation, and language identification, following the design of the [Open Whisper-style Speech Model (OWSM)](https://arxiv.org/abs/2401.16658) project.
21
+
22
+ To use the pre-trained model, please install `espnet` and `espnet_model_zoo`. The requirements are:
23
+ ```
24
+ librosa
25
+ torch
26
+ espnet
27
+ espnet_model_zoo
28
+ ```
29
+
30
+ **Example usage can be found in ESPnet:** https://github.com/espnet/espnet/tree/master/egs2/owsm_ctc_v3.1/s2t1
data/token_list/bpe_unigram50000/bpe.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:71278ea470760e4a9c903242cabda668a846307761c8b947976113970a394795
3
+ size 1036703
exp/s2t_stats_raw_bpe50000/train/feats_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:00c22dba27594df8f1d8f74a491b20c6e6e8c17e92159f81dfd634f98c098654
3
+ size 1786
exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/RESULTS.md ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ <!-- Generated by scripts/utils/show_asr_result.sh -->
2
+ # RESULTS
3
+ ## Environments
4
+ - date: `Wed Jan 15 18:41:49 CST 2025`
5
+ - python version: `3.11.8 | packaged by conda-forge | (main, Feb 16 2024, 20:38:00) [GCC 12.3.0]`
6
+ - espnet version: `espnet 202412`
7
+ - pytorch version: `pytorch 2.5.1`
8
+ - Git hash: `447443550139bec75838b5a6dc654373c21079a0`
9
+ - Commit date: `Sun Jan 12 17:15:11 2025 -0600`
exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/config.yaml ADDED
The diff for this file is too large to render. See raw diff
 
exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/images/backward_time.png ADDED
exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/images/cer_ctc.png ADDED
exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/images/cer_interctc_layer12.png ADDED
exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/images/cer_interctc_layer15.png ADDED
exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/images/cer_interctc_layer21.png ADDED
exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/images/cer_interctc_layer6.png ADDED
exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/images/clip.png ADDED
exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/images/forward_time.png ADDED
exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/images/gpu_max_cached_mem_GB.png ADDED
exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/images/grad_norm.png ADDED
exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/images/iter_time.png ADDED
exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/images/loss.png ADDED
exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/images/loss_ctc.png ADDED
exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/images/loss_interctc_layer12.png ADDED
exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/images/loss_interctc_layer15.png ADDED
exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/images/loss_interctc_layer21.png ADDED
exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/images/loss_interctc_layer6.png ADDED
exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/images/loss_scale.png ADDED
exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/images/optim0_lr0.png ADDED
exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/images/optim_step_time.png ADDED
exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/images/train_time.png ADDED
exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/train.1.log ADDED
The diff for this file is too large to render. See raw diff
 
exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/train.2.log ADDED
The diff for this file is too large to render. See raw diff
 
exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/train.3.log ADDED
The diff for this file is too large to render. See raw diff
 
exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/train.4.log ADDED
The diff for this file is too large to render. See raw diff
 
exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/train.5.log ADDED
The diff for this file is too large to render. See raw diff
 
exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/train.6.log ADDED
The diff for this file is too large to render. See raw diff
 
exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/train.7.log ADDED
The diff for this file is too large to render. See raw diff
 
exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/train.8.log ADDED
The diff for this file is too large to render. See raw diff
 
exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/train.log ADDED
The diff for this file is too large to render. See raw diff
 
exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/valid.total_count.ave_5best.till70epoch.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:10f28cd40ee38582f68131f6502f306762183bf0e85fdc238ea401bd0e863ae4
3
+ size 4045987222
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: '202310'
2
+ files:
3
+ s2t_model_file: exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/valid.total_count.ave_5best.till70epoch.pth
4
+ python: 3.12.2 | packaged by conda-forge | (main, Feb 16 2024, 20:38:53) [GCC 12.3.0]
5
+ timestamp: 1737002738.816795
6
+ torch: 2.5.1
7
+ yaml_files:
8
+ s2t_train_config: exp/s2t_train_owsmctc_ebf27_conv2d8_size1024_mel128_bs320_raw_bpe50000/config.yaml