saurabhk0322 committed on
Commit
fb707a3
·
1 Parent(s): 8c887ec

Update model

Browse files
Files changed (23) hide show
  1. README.md +612 -0
  2. data_respin/data_asru25/nlsyms.txt +8 -0
  3. exp/asru25/exp_large/asr_multilingual_lid_con_e8_lin1024_bs6M_gacc1_ctc03/RESULTS.md +27 -0
  4. exp/asru25/exp_large/asr_multilingual_lid_con_e8_lin1024_bs6M_gacc1_ctc03/config.yaml +511 -0
  5. exp/asru25/exp_large/asr_multilingual_lid_con_e8_lin1024_bs6M_gacc1_ctc03/images/acc.png +0 -0
  6. exp/asru25/exp_large/asr_multilingual_lid_con_e8_lin1024_bs6M_gacc1_ctc03/images/backward_time.png +0 -0
  7. exp/asru25/exp_large/asr_multilingual_lid_con_e8_lin1024_bs6M_gacc1_ctc03/images/cer.png +0 -0
  8. exp/asru25/exp_large/asr_multilingual_lid_con_e8_lin1024_bs6M_gacc1_ctc03/images/cer_ctc.png +0 -0
  9. exp/asru25/exp_large/asr_multilingual_lid_con_e8_lin1024_bs6M_gacc1_ctc03/images/clip.png +0 -0
  10. exp/asru25/exp_large/asr_multilingual_lid_con_e8_lin1024_bs6M_gacc1_ctc03/images/forward_time.png +0 -0
  11. exp/asru25/exp_large/asr_multilingual_lid_con_e8_lin1024_bs6M_gacc1_ctc03/images/gpu_max_cached_mem_GB.png +0 -0
  12. exp/asru25/exp_large/asr_multilingual_lid_con_e8_lin1024_bs6M_gacc1_ctc03/images/grad_norm.png +0 -0
  13. exp/asru25/exp_large/asr_multilingual_lid_con_e8_lin1024_bs6M_gacc1_ctc03/images/iter_time.png +0 -0
  14. exp/asru25/exp_large/asr_multilingual_lid_con_e8_lin1024_bs6M_gacc1_ctc03/images/loss.png +0 -0
  15. exp/asru25/exp_large/asr_multilingual_lid_con_e8_lin1024_bs6M_gacc1_ctc03/images/loss_att.png +0 -0
  16. exp/asru25/exp_large/asr_multilingual_lid_con_e8_lin1024_bs6M_gacc1_ctc03/images/loss_ctc.png +0 -0
  17. exp/asru25/exp_large/asr_multilingual_lid_con_e8_lin1024_bs6M_gacc1_ctc03/images/loss_scale.png +0 -0
  18. exp/asru25/exp_large/asr_multilingual_lid_con_e8_lin1024_bs6M_gacc1_ctc03/images/optim0_lr0.png +0 -0
  19. exp/asru25/exp_large/asr_multilingual_lid_con_e8_lin1024_bs6M_gacc1_ctc03/images/optim_step_time.png +0 -0
  20. exp/asru25/exp_large/asr_multilingual_lid_con_e8_lin1024_bs6M_gacc1_ctc03/images/train_time.png +0 -0
  21. exp/asru25/exp_large/asr_multilingual_lid_con_e8_lin1024_bs6M_gacc1_ctc03/images/wer.png +0 -0
  22. exp/asru25/exp_large/asr_multilingual_lid_con_e8_lin1024_bs6M_gacc1_ctc03/valid.acc.ave_5best.pth +3 -0
  23. meta.yaml +8 -0
README.md ADDED
@@ -0,0 +1,612 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - automatic-speech-recognition
6
+ language: multilingual
7
+ datasets:
8
+ - respin_asru25
9
+ license: cc-by-4.0
10
+ ---
11
+
12
+ ## ESPnet2 ASR model
13
+
14
+ ### `saurabhk0322/respin_asru25_track3`
15
+
16
+ This model was trained by wtc15 using the respin_asru25 recipe in [espnet](https://github.com/espnet/espnet/).
17
+
18
+ ### Demo: How to use in ESPnet2
19
+
20
+ Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html)
21
+ if you haven't done that already.
22
+
23
+ ```bash
24
+ cd espnet
25
+
26
+ pip install -e .
27
+ cd egs2/respin_asru25/asr1
28
+ ./run.sh --skip_data_prep false --skip_train true --download_model saurabhk0322/respin_asru25_track3
29
+ ```
30
+
31
+ <!-- Generated by scripts/utils/show_asr_result.sh -->
32
+ # RESULTS
33
+ ## Environments
34
+ - date: `Wed Apr 23 00:04:53 IST 2025`
35
+ - python version: `3.10.12 (main, Feb 4 2025, 14:57:36) [GCC 11.4.0]`
36
+ - espnet version: `espnet 202503`
37
+ - pytorch version: `pytorch 2.3.0+cu121`
38
+ - Git hash: `0cc9d62673c1461efe37632aeab297a311bcd7f0`
39
+ - Commit date: `Sat Apr 12 17:53:44 2025 -0400`
40
+
41
+ ## exp/asru25/exp_large/asr_multilingual_lid_con_e8_lin1024_bs6M_gacc1_ctc03/decode_lid_asr_model_valid.acc.ave
42
+ ### WER
43
+
44
+ |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
45
+ |---|---|---|---|---|---|---|---|---|
46
+ |org/dev_lid|11507|111296|85.3|13.6|1.1|1.0|15.7|67.6|
47
+
48
+ ### CER
49
+
50
+ |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
51
+ |---|---|---|---|---|---|---|---|---|
52
+ |org/dev_lid|11507|615241|97.3|1.6|1.0|0.9|3.5|67.6|
53
+
54
+ ### TER
55
+
56
+ |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
57
+ |---|---|---|---|---|---|---|---|---|
58
+
59
+ ## ASR config
60
+
61
+ <details><summary>expand</summary>
62
+
63
+ ```
64
+ config: conf/tuning/train_asr_conformer_transformer_e8_linear1024_bs6M_gacc1.yaml
65
+ print_config: false
66
+ log_level: INFO
67
+ drop_last_iter: false
68
+ dry_run: false
69
+ iterator_type: sequence
70
+ valid_iterator_type: null
71
+ output_dir: exp/asru25/exp_large_lid_indic_char/asr_noaux_con_e8_lin1024_bs6M_gacc1_ctc03_lid
72
+ ngpu: 1
73
+ seed: 2022
74
+ num_workers: 8
75
+ num_att_plot: 3
76
+ dist_backend: nccl
77
+ dist_init_method: env://
78
+ dist_world_size: null
79
+ dist_rank: null
80
+ local_rank: 0
81
+ dist_master_addr: null
82
+ dist_master_port: null
83
+ dist_launcher: null
84
+ multiprocessing_distributed: false
85
+ unused_parameters: false
86
+ sharded_ddp: false
87
+ use_deepspeed: false
88
+ deepspeed_config: null
89
+ gradient_as_bucket_view: true
90
+ ddp_comm_hook: null
91
+ cudnn_enabled: true
92
+ cudnn_benchmark: false
93
+ cudnn_deterministic: true
94
+ use_tf32: false
95
+ collect_stats: false
96
+ write_collected_feats: false
97
+ max_epoch: 70
98
+ patience: 5
99
+ val_scheduler_criterion:
100
+ - valid
101
+ - loss
102
+ early_stopping_criterion:
103
+ - valid
104
+ - loss
105
+ - min
106
+ best_model_criterion:
107
+ - - valid
108
+ - acc
109
+ - max
110
+ keep_nbest_models: 5
111
+ nbest_averaging_interval: 0
112
+ grad_clip: 5.0
113
+ grad_clip_type: 2.0
114
+ grad_noise: false
115
+ accum_grad: 1
116
+ no_forward_run: false
117
+ resume: true
118
+ train_dtype: float32
119
+ use_amp: true
120
+ log_interval: null
121
+ use_matplotlib: true
122
+ use_tensorboard: true
123
+ create_graph_in_tensorboard: false
124
+ use_wandb: false
125
+ wandb_project: null
126
+ wandb_id: null
127
+ wandb_entity: null
128
+ wandb_name: null
129
+ wandb_model_log_interval: -1
130
+ detect_anomaly: false
131
+ use_adapter: false
132
+ adapter: lora
133
+ save_strategy: all
134
+ adapter_conf: {}
135
+ pretrain_path: null
136
+ init_param: []
137
+ ignore_init_mismatch: false
138
+ freeze_param: []
139
+ num_iters_per_epoch: null
140
+ batch_size: 20
141
+ valid_batch_size: null
142
+ batch_bins: 6000000
143
+ valid_batch_bins: null
144
+ category_sample_size: 10
145
+ train_shape_file:
146
+ - exp/asru25/exp_large_lid_indic_char/asr_stats_raw_large_lid_char_sp/train/speech_shape
147
+ - exp/asru25/exp_large_lid_indic_char/asr_stats_raw_large_lid_char_sp/train/text_shape.char
148
+ valid_shape_file:
149
+ - exp/asru25/exp_large_lid_indic_char/asr_stats_raw_large_lid_char_sp/valid/speech_shape
150
+ - exp/asru25/exp_large_lid_indic_char/asr_stats_raw_large_lid_char_sp/valid/text_shape.char
151
+ batch_type: numel
152
+ valid_batch_type: null
153
+ fold_length:
154
+ - 80000
155
+ - 150
156
+ sort_in_batch: descending
157
+ shuffle_within_batch: false
158
+ sort_batch: descending
159
+ multiple_iterator: false
160
+ chunk_length: 500
161
+ chunk_shift_ratio: 0.5
162
+ num_cache_chunks: 1024
163
+ chunk_excluded_key_prefixes: []
164
+ chunk_default_fs: null
165
+ chunk_max_abs_length: null
166
+ chunk_discard_short_samples: true
167
+ train_data_path_and_name_and_type:
168
+ - - dump/asru25_lid/raw/train_large_lid_sp/wav.scp
169
+ - speech
170
+ - sound
171
+ - - dump/asru25_lid/raw/train_large_lid_sp/text
172
+ - text
173
+ - text
174
+ valid_data_path_and_name_and_type:
175
+ - - dump/asru25_lid/raw/dev_lid/wav.scp
176
+ - speech
177
+ - sound
178
+ - - dump/asru25_lid/raw/dev_lid/text
179
+ - text
180
+ - text
181
+ multi_task_dataset: false
182
+ allow_variable_data_keys: false
183
+ max_cache_size: 0.0
184
+ max_cache_fd: 32
185
+ allow_multi_rates: false
186
+ valid_max_cache_size: null
187
+ exclude_weight_decay: false
188
+ exclude_weight_decay_conf: {}
189
+ optim: adam
190
+ optim_conf:
191
+ lr: 0.002
192
+ weight_decay: 1.0e-06
193
+ scheduler: warmuplr
194
+ scheduler_conf:
195
+ warmup_steps: 15000
196
+ token_list:
197
+ - <blank>
198
+ - <unk>
199
+ - <space>
200
+ - ा
201
+ - क
202
+ - े
203
+ - र
204
+ - ल
205
+ - न
206
+ - स
207
+ - ्
208
+ - त
209
+ - '['
210
+ - ']'
211
+ - म
212
+ - ी
213
+ - ि
214
+ - ್
215
+ - ह
216
+ - य
217
+ - ब
218
+ - प
219
+ - ो
220
+ - ్
221
+ - া
222
+ - व
223
+ - ज
224
+ - ं
225
+ - ు
226
+ - ా
227
+ - ి
228
+ - ಿ
229
+ - ে
230
+ - ಾ
231
+ - द
232
+ - র
233
+ - న
234
+ - ల
235
+ - ग
236
+ - ರ
237
+ - m
238
+ - ం
239
+ - క
240
+ - ক
241
+ - ర
242
+ - ು
243
+ - ನ
244
+ - ্
245
+ - ಕ
246
+ - ु
247
+ - च
248
+ - ट
249
+ - ತ
250
+ - ै
251
+ - ದ
252
+ - ি
253
+ - ख
254
+ - ೆ
255
+ - ಗ
256
+ - t
257
+ - ప
258
+ - త
259
+ - b
260
+ - h
261
+ - इ
262
+ - अ
263
+ - n
264
+ - आ
265
+ - ू
266
+ - ন
267
+ - ಸ
268
+ - ಲ
269
+ - వ
270
+ - ట
271
+ - స
272
+ - ವ
273
+ - ే
274
+ - छ
275
+ - ব
276
+ - ল
277
+ - ಯ
278
+ - भ
279
+ - श
280
+ - ಂ
281
+ - ಮ
282
+ - య
283
+ - য
284
+ - ध
285
+ - డ
286
+ - ద
287
+ - ए
288
+ - थ
289
+ - మ
290
+ - ಬ
291
+ - చ
292
+ - ण
293
+ - ड
294
+ - ম
295
+ - ई
296
+ - उ
297
+ - স
298
+ - ত
299
+ - ಳ
300
+ - ో
301
+ - ಡ
302
+ - फ
303
+ - g
304
+ - r
305
+ - e
306
+ - గ
307
+ - প
308
+ - ট
309
+ - য়
310
+ - c
311
+ - k
312
+ - ಟ
313
+ - ె
314
+ - .
315
+ - ಹ
316
+ - ಪ
317
+ - ೇ
318
+ - బ
319
+ - হ
320
+ - ু
321
+ - দ
322
+ - ष
323
+ - ো
324
+ - ీ
325
+ - জ
326
+ - ड़
327
+ - ಅ
328
+ - ೊ
329
+ - ই
330
+ - গ
331
+ - అ
332
+ - घ
333
+ - ಣ
334
+ - ठ
335
+ - ೋ
336
+ - চ
337
+ - ँ
338
+ - ొ
339
+ - ौ
340
+ - ছ
341
+ - ఎ
342
+ - ओ
343
+ - শ
344
+ - আ
345
+ - ూ
346
+ - జ
347
+ - ಜ
348
+ - থ
349
+ - ভ
350
+ - ಇ
351
+ - ೂ
352
+ - ಷ
353
+ - ಚ
354
+ - এ
355
+ - ষ
356
+ - ై
357
+ - ೀ
358
+ - ఉ
359
+ - ಎ
360
+ - ಆ
361
+ - ळ
362
+ - ধ
363
+ - ृ
364
+ - ী
365
+ - উ
366
+ - ফ
367
+ - খ
368
+ - ড
369
+ - ॉ
370
+ - ಶ
371
+ - অ
372
+ - ೈ
373
+ - ధ
374
+ - ং
375
+ - झ
376
+ - ఇ
377
+ - ఆ
378
+ - ష
379
+ - ढ
380
+ - ढ़
381
+ - భ
382
+ - శ
383
+ - ఏ
384
+ - ಧ
385
+ - ও
386
+ - ಒ
387
+ - ಭ
388
+ - ళ
389
+ - ಉ
390
+ - ॅ
391
+ - ಫ
392
+ - ऊ
393
+ - ఫ
394
+ - ಥ
395
+ - ऑ
396
+ - ణ
397
+ - ড়
398
+ - ণ
399
+ - ঙ
400
+ - ऋ
401
+ - ಖ
402
+ - ऽ
403
+ - హ
404
+ - థ
405
+ - औ
406
+ - ೃ
407
+ - ঁ
408
+ - ೌ
409
+ - ఒ
410
+ - ఖ
411
+ - ৃ
412
+ - ఈ
413
+ - ಏ
414
+ - ঠ
415
+ - ౌ
416
+ - ಐ
417
+ - ৈ
418
+ - ऐ
419
+ - ऱ
420
+ - ఐ
421
+ - ূ
422
+ - ञ
423
+ - ৎ
424
+ - ృ
425
+ - ज़
426
+ - ঞ
427
+ - ಈ
428
+ - ঘ
429
+ - ঋ
430
+ - ঝ
431
+ - फ़
432
+ - ৌ
433
+ - ಠ
434
+ - ः
435
+ - ఓ
436
+ - ಘ
437
+ - ಛ
438
+ - ಓ
439
+ - ఊ
440
+ - ఋ
441
+ - ಔ
442
+ - ఛ
443
+ - ಞ
444
+ - ॲ
445
+ - ಊ
446
+ - ఘ
447
+ - ঢ
448
+ - ख़
449
+ - ়
450
+ - ऍ
451
+ - ಋ
452
+ - क़
453
+ - ఠ
454
+ - ঢ়
455
+ - ঃ
456
+ - ़
457
+ - ೕ
458
+ - ఔ
459
+ - ಢ
460
+ - ঊ
461
+ - ఱ
462
+ - ಃ
463
+ - ಝ
464
+ - ङ
465
+ - ఢ
466
+ - ग़
467
+ - ఞ
468
+ - ঐ
469
+ - ঔ
470
+ - ॠ
471
+ - ':'
472
+ - ೯
473
+ - ೖ
474
+ - ঈ
475
+ - ః
476
+ - ౖ
477
+ - ৠ
478
+ - ౦
479
+ - <sos/eos>
480
+ init: null
481
+ input_size: null
482
+ ctc_conf:
483
+ dropout_rate: 0.0
484
+ ctc_type: builtin
485
+ reduce: true
486
+ ignore_nan_grad: null
487
+ zero_infinity: true
488
+ brctc_risk_strategy: exp
489
+ brctc_group_strategy: end
490
+ brctc_risk_factor: 0.0
491
+ joint_net_conf: null
492
+ use_preprocessor: true
493
+ use_lang_prompt: false
494
+ use_nlp_prompt: false
495
+ token_type: char
496
+ bpemodel: null
497
+ non_linguistic_symbols: data/nlsyms.txt
498
+ cleaner: null
499
+ g2p: null
500
+ speech_volume_normalize: null
501
+ rir_scp: null
502
+ rir_apply_prob: 1.0
503
+ noise_scp: null
504
+ noise_apply_prob: 1.0
505
+ noise_db_range: '13_15'
506
+ short_noise_thres: 0.5
507
+ aux_ctc_tasks: []
508
+ frontend: default
509
+ frontend_conf:
510
+ n_fft: 512
511
+ win_length: 400
512
+ hop_length: 160
513
+ fs: 16k
514
+ specaug: specaug
515
+ specaug_conf:
516
+ apply_time_warp: true
517
+ time_warp_window: 5
518
+ time_warp_mode: bicubic
519
+ apply_freq_mask: true
520
+ freq_mask_width_range:
521
+ - 0
522
+ - 27
523
+ num_freq_mask: 2
524
+ apply_time_mask: true
525
+ time_mask_width_ratio_range:
526
+ - 0.0
527
+ - 0.05
528
+ num_time_mask: 5
529
+ normalize: utterance_mvn
530
+ normalize_conf: {}
531
+ model: espnet
532
+ model_conf:
533
+ ctc_weight: 0.3
534
+ lsm_weight: 0.1
535
+ length_normalized_loss: false
536
+ preencoder: null
537
+ preencoder_conf: {}
538
+ encoder: conformer
539
+ encoder_conf:
540
+ output_size: 256
541
+ attention_heads: 4
542
+ linear_units: 1024
543
+ num_blocks: 8
544
+ dropout_rate: 0.1
545
+ positional_dropout_rate: 0.1
546
+ attention_dropout_rate: 0.1
547
+ input_layer: conv2d2
548
+ normalize_before: true
549
+ macaron_style: true
550
+ rel_pos_type: latest
551
+ pos_enc_layer_type: rel_pos
552
+ selfattention_layer_type: rel_selfattn
553
+ activation_type: swish
554
+ use_cnn_module: true
555
+ cnn_module_kernel: 31
556
+ postencoder: null
557
+ postencoder_conf: {}
558
+ decoder: transformer
559
+ decoder_conf:
560
+ attention_heads: 4
561
+ linear_units: 2048
562
+ num_blocks: 6
563
+ dropout_rate: 0.1
564
+ positional_dropout_rate: 0.1
565
+ self_attention_dropout_rate: 0.1
566
+ src_attention_dropout_rate: 0.1
567
+ layer_drop_rate: 0.0
568
+ preprocessor: default
569
+ preprocessor_conf: {}
570
+ required:
571
+ - output_dir
572
+ - token_list
573
+ version: '202503'
574
+ distributed: false
575
+ ```
576
+
577
+ </details>
578
+
579
+
580
+
581
+ ### Citing ESPnet
582
+
583
+ ```bibtex
584
+ @inproceedings{watanabe2018espnet,
585
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
586
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
587
+ year={2018},
588
+ booktitle={Proceedings of Interspeech},
589
+ pages={2207--2211},
590
+ doi={10.21437/Interspeech.2018-1456},
591
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
592
+ }
593
+
594
+
595
+
596
+
597
+
598
+
599
+ ```
600
+
601
+ or arXiv:
602
+
603
+ ```bibtex
604
+ @misc{watanabe2018espnet,
605
+ title={ESPnet: End-to-End Speech Processing Toolkit},
606
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
607
+ year={2018},
608
+ eprint={1804.00015},
609
+ archivePrefix={arXiv},
610
+ primaryClass={cs.CL}
611
+ }
612
+ ```
data_respin/data_asru25/nlsyms.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ [bh]
2
+ [bn]
3
+ [ch]
4
+ [kn]
5
+ [mg]
6
+ [mr]
7
+ [mt]
8
+ [te]
exp/asru25/exp_large/asr_multilingual_lid_con_e8_lin1024_bs6M_gacc1_ctc03/RESULTS.md ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!-- Generated by scripts/utils/show_asr_result.sh -->
2
+ # RESULTS
3
+ ## Environments
4
+ - date: `Wed Apr 23 00:04:53 IST 2025`
5
+ - python version: `3.10.12 (main, Feb 4 2025, 14:57:36) [GCC 11.4.0]`
6
+ - espnet version: `espnet 202503`
7
+ - pytorch version: `pytorch 2.3.0+cu121`
8
+ - Git hash: `0cc9d62673c1461efe37632aeab297a311bcd7f0`
9
+ - Commit date: `Sat Apr 12 17:53:44 2025 -0400`
10
+
11
+ ## exp/asru25/exp_large/asr_multilingual_lid_con_e8_lin1024_bs6M_gacc1_ctc03/decode_lid_asr_model_valid.acc.ave
12
+ ### WER
13
+
14
+ |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
15
+ |---|---|---|---|---|---|---|---|---|
16
+ |org/dev_lid|11507|111296|85.3|13.6|1.1|1.0|15.7|67.6|
17
+
18
+ ### CER
19
+
20
+ |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
21
+ |---|---|---|---|---|---|---|---|---|
22
+ |org/dev_lid|11507|615241|97.3|1.6|1.0|0.9|3.5|67.6|
23
+
24
+ ### TER
25
+
26
+ |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
27
+ |---|---|---|---|---|---|---|---|---|
exp/asru25/exp_large/asr_multilingual_lid_con_e8_lin1024_bs6M_gacc1_ctc03/config.yaml ADDED
@@ -0,0 +1,511 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/tuning/train_asr_conformer_transformer_e8_linear1024_bs6M_gacc1.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ drop_last_iter: false
5
+ dry_run: false
6
+ iterator_type: sequence
7
+ valid_iterator_type: null
8
+ output_dir: exp/asru25/exp_large_lid_indic_char/asr_noaux_con_e8_lin1024_bs6M_gacc1_ctc03_lid
9
+ ngpu: 1
10
+ seed: 2022
11
+ num_workers: 8
12
+ num_att_plot: 3
13
+ dist_backend: nccl
14
+ dist_init_method: env://
15
+ dist_world_size: null
16
+ dist_rank: null
17
+ local_rank: 0
18
+ dist_master_addr: null
19
+ dist_master_port: null
20
+ dist_launcher: null
21
+ multiprocessing_distributed: false
22
+ unused_parameters: false
23
+ sharded_ddp: false
24
+ use_deepspeed: false
25
+ deepspeed_config: null
26
+ gradient_as_bucket_view: true
27
+ ddp_comm_hook: null
28
+ cudnn_enabled: true
29
+ cudnn_benchmark: false
30
+ cudnn_deterministic: true
31
+ use_tf32: false
32
+ collect_stats: false
33
+ write_collected_feats: false
34
+ max_epoch: 70
35
+ patience: 5
36
+ val_scheduler_criterion:
37
+ - valid
38
+ - loss
39
+ early_stopping_criterion:
40
+ - valid
41
+ - loss
42
+ - min
43
+ best_model_criterion:
44
+ - - valid
45
+ - acc
46
+ - max
47
+ keep_nbest_models: 5
48
+ nbest_averaging_interval: 0
49
+ grad_clip: 5.0
50
+ grad_clip_type: 2.0
51
+ grad_noise: false
52
+ accum_grad: 1
53
+ no_forward_run: false
54
+ resume: true
55
+ train_dtype: float32
56
+ use_amp: true
57
+ log_interval: null
58
+ use_matplotlib: true
59
+ use_tensorboard: true
60
+ create_graph_in_tensorboard: false
61
+ use_wandb: false
62
+ wandb_project: null
63
+ wandb_id: null
64
+ wandb_entity: null
65
+ wandb_name: null
66
+ wandb_model_log_interval: -1
67
+ detect_anomaly: false
68
+ use_adapter: false
69
+ adapter: lora
70
+ save_strategy: all
71
+ adapter_conf: {}
72
+ pretrain_path: null
73
+ init_param: []
74
+ ignore_init_mismatch: false
75
+ freeze_param: []
76
+ num_iters_per_epoch: null
77
+ batch_size: 20
78
+ valid_batch_size: null
79
+ batch_bins: 6000000
80
+ valid_batch_bins: null
81
+ category_sample_size: 10
82
+ train_shape_file:
83
+ - exp/asru25/exp_large_lid_indic_char/asr_stats_raw_large_lid_char_sp/train/speech_shape
84
+ - exp/asru25/exp_large_lid_indic_char/asr_stats_raw_large_lid_char_sp/train/text_shape.char
85
+ valid_shape_file:
86
+ - exp/asru25/exp_large_lid_indic_char/asr_stats_raw_large_lid_char_sp/valid/speech_shape
87
+ - exp/asru25/exp_large_lid_indic_char/asr_stats_raw_large_lid_char_sp/valid/text_shape.char
88
+ batch_type: numel
89
+ valid_batch_type: null
90
+ fold_length:
91
+ - 80000
92
+ - 150
93
+ sort_in_batch: descending
94
+ shuffle_within_batch: false
95
+ sort_batch: descending
96
+ multiple_iterator: false
97
+ chunk_length: 500
98
+ chunk_shift_ratio: 0.5
99
+ num_cache_chunks: 1024
100
+ chunk_excluded_key_prefixes: []
101
+ chunk_default_fs: null
102
+ chunk_max_abs_length: null
103
+ chunk_discard_short_samples: true
104
+ train_data_path_and_name_and_type:
105
+ - - dump/asru25_lid/raw/train_large_lid_sp/wav.scp
106
+ - speech
107
+ - sound
108
+ - - dump/asru25_lid/raw/train_large_lid_sp/text
109
+ - text
110
+ - text
111
+ valid_data_path_and_name_and_type:
112
+ - - dump/asru25_lid/raw/dev_lid/wav.scp
113
+ - speech
114
+ - sound
115
+ - - dump/asru25_lid/raw/dev_lid/text
116
+ - text
117
+ - text
118
+ multi_task_dataset: false
119
+ allow_variable_data_keys: false
120
+ max_cache_size: 0.0
121
+ max_cache_fd: 32
122
+ allow_multi_rates: false
123
+ valid_max_cache_size: null
124
+ exclude_weight_decay: false
125
+ exclude_weight_decay_conf: {}
126
+ optim: adam
127
+ optim_conf:
128
+ lr: 0.002
129
+ weight_decay: 1.0e-06
130
+ scheduler: warmuplr
131
+ scheduler_conf:
132
+ warmup_steps: 15000
133
+ token_list:
134
+ - <blank>
135
+ - <unk>
136
+ - <space>
137
+ - ा
138
+ - क
139
+ - े
140
+ - र
141
+ - ल
142
+ - न
143
+ - स
144
+ - ्
145
+ - त
146
+ - '['
147
+ - ']'
148
+ - म
149
+ - ी
150
+ - ि
151
+ - ್
152
+ - ह
153
+ - य
154
+ - ब
155
+ - प
156
+ - ो
157
+ - ్
158
+ - া
159
+ - व
160
+ - ज
161
+ - ं
162
+ - ు
163
+ - ా
164
+ - ి
165
+ - ಿ
166
+ - ে
167
+ - ಾ
168
+ - द
169
+ - র
170
+ - న
171
+ - ల
172
+ - ग
173
+ - ರ
174
+ - m
175
+ - ం
176
+ - క
177
+ - ক
178
+ - ర
179
+ - ು
180
+ - ನ
181
+ - ্
182
+ - ಕ
183
+ - ु
184
+ - च
185
+ - ट
186
+ - ತ
187
+ - ै
188
+ - ದ
189
+ - ি
190
+ - ख
191
+ - ೆ
192
+ - ಗ
193
+ - t
194
+ - ప
195
+ - త
196
+ - b
197
+ - h
198
+ - इ
199
+ - अ
200
+ - n
201
+ - आ
202
+ - ू
203
+ - ন
204
+ - ಸ
205
+ - ಲ
206
+ - వ
207
+ - ట
208
+ - స
209
+ - ವ
210
+ - ే
211
+ - छ
212
+ - ব
213
+ - ল
214
+ - ಯ
215
+ - भ
216
+ - श
217
+ - ಂ
218
+ - ಮ
219
+ - య
220
+ - য
221
+ - ध
222
+ - డ
223
+ - ద
224
+ - ए
225
+ - थ
226
+ - మ
227
+ - ಬ
228
+ - చ
229
+ - ण
230
+ - ड
231
+ - ম
232
+ - ई
233
+ - उ
234
+ - স
235
+ - ত
236
+ - ಳ
237
+ - ో
238
+ - ಡ
239
+ - फ
240
+ - g
241
+ - r
242
+ - e
243
+ - గ
244
+ - প
245
+ - ট
246
+ - য়
247
+ - c
248
+ - k
249
+ - ಟ
250
+ - ె
251
+ - .
252
+ - ಹ
253
+ - ಪ
254
+ - ೇ
255
+ - బ
256
+ - হ
257
+ - ু
258
+ - দ
259
+ - ष
260
+ - ো
261
+ - ీ
262
+ - জ
263
+ - ड़
264
+ - ಅ
265
+ - ೊ
266
+ - ই
267
+ - গ
268
+ - అ
269
+ - घ
270
+ - ಣ
271
+ - ठ
272
+ - ೋ
273
+ - চ
274
+ - ँ
275
+ - ొ
276
+ - ौ
277
+ - ছ
278
+ - ఎ
279
+ - ओ
280
+ - শ
281
+ - আ
282
+ - ూ
283
+ - జ
284
+ - ಜ
285
+ - থ
286
+ - ভ
287
+ - ಇ
288
+ - ೂ
289
+ - ಷ
290
+ - ಚ
291
+ - এ
292
+ - ষ
293
+ - ై
294
+ - ೀ
295
+ - ఉ
296
+ - ಎ
297
+ - ಆ
298
+ - ळ
299
+ - ধ
300
+ - ृ
301
+ - ী
302
+ - উ
303
+ - ফ
304
+ - খ
305
+ - ড
306
+ - ॉ
307
+ - ಶ
308
+ - অ
309
+ - ೈ
310
+ - ధ
311
+ - ং
312
+ - झ
313
+ - ఇ
314
+ - ఆ
315
+ - ష
316
+ - ढ
317
+ - ढ़
318
+ - భ
319
+ - శ
320
+ - ఏ
321
+ - ಧ
322
+ - ও
323
+ - ಒ
324
+ - ಭ
325
+ - ళ
326
+ - ಉ
327
+ - ॅ
328
+ - ಫ
329
+ - ऊ
330
+ - ఫ
331
+ - ಥ
332
+ - ऑ
333
+ - ణ
334
+ - ড়
335
+ - ণ
336
+ - ঙ
337
+ - ऋ
338
+ - ಖ
339
+ - ऽ
340
+ - హ
341
+ - థ
342
+ - औ
343
+ - ೃ
344
+ - ঁ
345
+ - ೌ
346
+ - ఒ
347
+ - ఖ
348
+ - ৃ
349
+ - ఈ
350
+ - ಏ
351
+ - ঠ
352
+ - ౌ
353
+ - ಐ
354
+ - ৈ
355
+ - ऐ
356
+ - ऱ
357
+ - ఐ
358
+ - ূ
359
+ - ञ
360
+ - ৎ
361
+ - ృ
362
+ - ज़
363
+ - ঞ
364
+ - ಈ
365
+ - ঘ
366
+ - ঋ
367
+ - ঝ
368
+ - फ़
369
+ - ৌ
370
+ - ಠ
371
+ - ः
372
+ - ఓ
373
+ - ಘ
374
+ - ಛ
375
+ - ಓ
376
+ - ఊ
377
+ - ఋ
378
+ - ಔ
379
+ - ఛ
380
+ - ಞ
381
+ - ॲ
382
+ - ಊ
383
+ - ఘ
384
+ - ঢ
385
+ - ख़
386
+ - ়
387
+ - ऍ
388
+ - ಋ
389
+ - क़
390
+ - ఠ
391
+ - ঢ়
392
+ - ঃ
393
+ - ़
394
+ - ೕ
395
+ - ఔ
396
+ - ಢ
397
+ - ঊ
398
+ - ఱ
399
+ - ಃ
400
+ - ಝ
401
+ - ङ
402
+ - ఢ
403
+ - ग़
404
+ - ఞ
405
+ - ঐ
406
+ - ঔ
407
+ - ॠ
408
+ - ':'
409
+ - ೯
410
+ - ೖ
411
+ - ঈ
412
+ - ః
413
+ - ౖ
414
+ - ৠ
415
+ - ౦
416
+ - <sos/eos>
417
+ init: null
418
+ input_size: null
419
+ ctc_conf:
420
+ dropout_rate: 0.0
421
+ ctc_type: builtin
422
+ reduce: true
423
+ ignore_nan_grad: null
424
+ zero_infinity: true
425
+ brctc_risk_strategy: exp
426
+ brctc_group_strategy: end
427
+ brctc_risk_factor: 0.0
428
+ joint_net_conf: null
429
+ use_preprocessor: true
430
+ use_lang_prompt: false
431
+ use_nlp_prompt: false
432
+ token_type: char
433
+ bpemodel: null
434
+ non_linguistic_symbols: data/nlsyms.txt
435
+ cleaner: null
436
+ g2p: null
437
+ speech_volume_normalize: null
438
+ rir_scp: null
439
+ rir_apply_prob: 1.0
440
+ noise_scp: null
441
+ noise_apply_prob: 1.0
442
+ noise_db_range: '13_15'
443
+ short_noise_thres: 0.5
444
+ aux_ctc_tasks: []
445
+ frontend: default
446
+ frontend_conf:
447
+ n_fft: 512
448
+ win_length: 400
449
+ hop_length: 160
450
+ fs: 16k
451
+ specaug: specaug
452
+ specaug_conf:
453
+ apply_time_warp: true
454
+ time_warp_window: 5
455
+ time_warp_mode: bicubic
456
+ apply_freq_mask: true
457
+ freq_mask_width_range:
458
+ - 0
459
+ - 27
460
+ num_freq_mask: 2
461
+ apply_time_mask: true
462
+ time_mask_width_ratio_range:
463
+ - 0.0
464
+ - 0.05
465
+ num_time_mask: 5
466
+ normalize: utterance_mvn
467
+ normalize_conf: {}
468
+ model: espnet
469
+ model_conf:
470
+ ctc_weight: 0.3
471
+ lsm_weight: 0.1
472
+ length_normalized_loss: false
473
+ preencoder: null
474
+ preencoder_conf: {}
475
+ encoder: conformer
476
+ encoder_conf:
477
+ output_size: 256
478
+ attention_heads: 4
479
+ linear_units: 1024
480
+ num_blocks: 8
481
+ dropout_rate: 0.1
482
+ positional_dropout_rate: 0.1
483
+ attention_dropout_rate: 0.1
484
+ input_layer: conv2d2
485
+ normalize_before: true
486
+ macaron_style: true
487
+ rel_pos_type: latest
488
+ pos_enc_layer_type: rel_pos
489
+ selfattention_layer_type: rel_selfattn
490
+ activation_type: swish
491
+ use_cnn_module: true
492
+ cnn_module_kernel: 31
493
+ postencoder: null
494
+ postencoder_conf: {}
495
+ decoder: transformer
496
+ decoder_conf:
497
+ attention_heads: 4
498
+ linear_units: 2048
499
+ num_blocks: 6
500
+ dropout_rate: 0.1
501
+ positional_dropout_rate: 0.1
502
+ self_attention_dropout_rate: 0.1
503
+ src_attention_dropout_rate: 0.1
504
+ layer_drop_rate: 0.0
505
+ preprocessor: default
506
+ preprocessor_conf: {}
507
+ required:
508
+ - output_dir
509
+ - token_list
510
+ version: '202503'
511
+ distributed: false
exp/asru25/exp_large/asr_multilingual_lid_con_e8_lin1024_bs6M_gacc1_ctc03/images/acc.png ADDED
exp/asru25/exp_large/asr_multilingual_lid_con_e8_lin1024_bs6M_gacc1_ctc03/images/backward_time.png ADDED
exp/asru25/exp_large/asr_multilingual_lid_con_e8_lin1024_bs6M_gacc1_ctc03/images/cer.png ADDED
exp/asru25/exp_large/asr_multilingual_lid_con_e8_lin1024_bs6M_gacc1_ctc03/images/cer_ctc.png ADDED
exp/asru25/exp_large/asr_multilingual_lid_con_e8_lin1024_bs6M_gacc1_ctc03/images/clip.png ADDED
exp/asru25/exp_large/asr_multilingual_lid_con_e8_lin1024_bs6M_gacc1_ctc03/images/forward_time.png ADDED
exp/asru25/exp_large/asr_multilingual_lid_con_e8_lin1024_bs6M_gacc1_ctc03/images/gpu_max_cached_mem_GB.png ADDED
exp/asru25/exp_large/asr_multilingual_lid_con_e8_lin1024_bs6M_gacc1_ctc03/images/grad_norm.png ADDED
exp/asru25/exp_large/asr_multilingual_lid_con_e8_lin1024_bs6M_gacc1_ctc03/images/iter_time.png ADDED
exp/asru25/exp_large/asr_multilingual_lid_con_e8_lin1024_bs6M_gacc1_ctc03/images/loss.png ADDED
exp/asru25/exp_large/asr_multilingual_lid_con_e8_lin1024_bs6M_gacc1_ctc03/images/loss_att.png ADDED
exp/asru25/exp_large/asr_multilingual_lid_con_e8_lin1024_bs6M_gacc1_ctc03/images/loss_ctc.png ADDED
exp/asru25/exp_large/asr_multilingual_lid_con_e8_lin1024_bs6M_gacc1_ctc03/images/loss_scale.png ADDED
exp/asru25/exp_large/asr_multilingual_lid_con_e8_lin1024_bs6M_gacc1_ctc03/images/optim0_lr0.png ADDED
exp/asru25/exp_large/asr_multilingual_lid_con_e8_lin1024_bs6M_gacc1_ctc03/images/optim_step_time.png ADDED
exp/asru25/exp_large/asr_multilingual_lid_con_e8_lin1024_bs6M_gacc1_ctc03/images/train_time.png ADDED
exp/asru25/exp_large/asr_multilingual_lid_con_e8_lin1024_bs6M_gacc1_ctc03/images/wer.png ADDED
exp/asru25/exp_large/asr_multilingual_lid_con_e8_lin1024_bs6M_gacc1_ctc03/valid.acc.ave_5best.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e95fdc2cca38ece938653d8033d2da3729981d780fd931ad56080593d63c5838
3
+ size 101971162
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: '202503'
2
+ files:
3
+ asr_model_file: exp/asru25/exp_large/asr_multilingual_lid_con_e8_lin1024_bs6M_gacc1_ctc03/valid.acc.ave_5best.pth
4
+ python: 3.10.12 (main, Feb 4 2025, 14:57:36) [GCC 11.4.0]
5
+ timestamp: 1745346895.184633
6
+ torch: 2.3.0+cu121
7
+ yaml_files:
8
+ asr_train_config: exp/asru25/exp_large/asr_multilingual_lid_con_e8_lin1024_bs6M_gacc1_ctc03/config.yaml