Commit 83335b4 (verified), parent 6c0383f, by ankitapasad: removed repeated "layers of"

Files changed (1): README.md (+585 -585)
---
license: cc-by-4.0
language:
- en
- de
- es
- fr
library_name: nemo
datasets:
- librispeech_asr
- fisher_corpus
- Switchboard-1
- WSJ-0
- WSJ-1
- National-Singapore-Corpus-Part-1
- National-Singapore-Corpus-Part-6
- vctk
- voxpopuli
- europarl
- multilingual_librispeech
- mozilla-foundation/common_voice_8_0
- MLCommons/peoples_speech
thumbnail: null
tags:
- automatic-speech-recognition
- automatic-speech-translation
- speech
- audio
- Transformer
- FastConformer
- Conformer
- pytorch
- NeMo
- hf-asr-leaderboard
widget:
- example_title: Librispeech sample 1
  src: https://cdn-media.huggingface.co/speech_samples/sample1.flac
- example_title: Librispeech sample 2
  src: https://cdn-media.huggingface.co/speech_samples/sample2.flac
model-index:
- name: canary-1b-flash
  results:
  - task:
      name: Automatic Speech Recognition
      type: automatic-speech-recognition
    dataset:
      name: LibriSpeech (other)
      type: librispeech_asr
      config: other
      split: test
      args:
        language: en
    metrics:
    - name: Test WER
      type: wer
      value: 2.87
  - task:
      name: Automatic Speech Recognition
      type: automatic-speech-recognition
    dataset:
      name: SPGI Speech
      type: kensho/spgispeech
      config: test
      split: test
      args:
        language: en
    metrics:
    - name: Test WER
      type: wer
      value: 1.95
  - task:
      name: Automatic Speech Recognition
      type: automatic-speech-recognition
    dataset:
      name: Mozilla Common Voice 16.1
      type: mozilla-foundation/common_voice_16_1
      config: en
      split: test
      args:
        language: en
    metrics:
    - name: Test WER (En)
      type: wer
      value: 6.99
  - task:
      name: Automatic Speech Recognition
      type: automatic-speech-recognition
    dataset:
      name: Mozilla Common Voice 16.1
      type: mozilla-foundation/common_voice_16_1
      config: de
      split: test
      args:
        language: de
    metrics:
    - name: Test WER (De)
      type: wer
      value: 4.03
  - task:
      name: Automatic Speech Recognition
      type: automatic-speech-recognition
    dataset:
      name: Mozilla Common Voice 16.1
      type: mozilla-foundation/common_voice_16_1
      config: es
      split: test
      args:
        language: es
    metrics:
    - name: Test WER (Es)
      type: wer
      value: 3.31
  - task:
      name: Automatic Speech Recognition
      type: automatic-speech-recognition
    dataset:
      name: Mozilla Common Voice 16.1
      type: mozilla-foundation/common_voice_16_1
      config: fr
      split: test
      args:
        language: fr
    metrics:
    - name: Test WER (Fr)
      type: wer
      value: 5.88
  - task:
      name: Automatic Speech Translation
      type: automatic-speech-translation
    dataset:
      name: FLEURS
      type: google/fleurs
      config: en_us
      split: test
      args:
        language: en-de
    metrics:
    - name: Test BLEU (En->De)
      type: bleu
      value: 32.27
  - task:
      name: Automatic Speech Translation
      type: automatic-speech-translation
    dataset:
      name: FLEURS
      type: google/fleurs
      config: en_us
      split: test
      args:
        language: en-es
    metrics:
    - name: Test BLEU (En->Es)
      type: bleu
      value: 22.6
  - task:
      name: Automatic Speech Translation
      type: automatic-speech-translation
    dataset:
      name: FLEURS
      type: google/fleurs
      config: en_us
      split: test
      args:
        language: en-fr
    metrics:
    - name: Test BLEU (En->Fr)
      type: bleu
      value: 41.22
  - task:
      name: Automatic Speech Translation
      type: automatic-speech-translation
    dataset:
      name: FLEURS
      type: google/fleurs
      config: de_de
      split: test
      args:
        language: de-en
    metrics:
    - name: Test BLEU (De->En)
      type: bleu
      value: 35.5
  - task:
      name: Automatic Speech Translation
      type: automatic-speech-translation
    dataset:
      name: FLEURS
      type: google/fleurs
      config: es_419
      split: test
      args:
        language: es-en
    metrics:
    - name: Test BLEU (Es->En)
      type: bleu
      value: 23.32
  - task:
      name: Automatic Speech Translation
      type: automatic-speech-translation
    dataset:
      name: FLEURS
      type: google/fleurs
      config: fr_fr
      split: test
      args:
        language: fr-en
    metrics:
    - name: Test BLEU (Fr->En)
      type: bleu
      value: 33.42
  - task:
      name: Automatic Speech Translation
      type: automatic-speech-translation
    dataset:
      name: COVOST
      type: covost2
      config: de_de
      split: test
      args:
        language: de-en
    metrics:
    - name: Test BLEU (De->En)
      type: bleu
      value: 39.33
  - task:
      name: Automatic Speech Translation
      type: automatic-speech-translation
    dataset:
      name: COVOST
      type: covost2
      config: es_419
      split: test
      args:
        language: es-en
    metrics:
    - name: Test BLEU (Es->En)
      type: bleu
      value: 41.86
  - task:
      name: Automatic Speech Translation
      type: automatic-speech-translation
    dataset:
      name: COVOST
      type: covost2
      config: fr_fr
      split: test
      args:
        language: fr-en
    metrics:
    - name: Test BLEU (Fr->En)
      type: bleu
      value: 41.43

metrics:
- wer
- bleu
pipeline_tag: automatic-speech-recognition
---

# Canary 1B Flash

## Description:
NVIDIA NeMo Canary [1] is a family of multilingual, multi-tasking models that achieve state-of-the-art performance on multiple speech benchmarks. With 883 million parameters and an inference speed of more than 900 RTFx (on open-asr-leaderboard datasets), canary-1b-flash supports automatic speech-to-text recognition (ASR) in four languages (English, German, French, Spanish) and translation from English to German/French/Spanish and from German/French/Spanish to English, with or without punctuation and capitalization (PnC). canary-1b-flash also supports word-level and segment-level timestamps for English, German, French, and Spanish. This model is released under the permissive CC-BY-4.0 license and is available for commercial use.


## Model Architecture:
Canary is an encoder-decoder model with a FastConformer [2] encoder and a Transformer decoder [3]. Given audio features extracted by the encoder, task tokens such as \<target language\>, \<task\>, \<toggle timestamps\>, and \<toggle PnC\> are fed into the Transformer decoder to trigger the text generation process. Canary uses a concatenated tokenizer [4] built from individual SentencePiece [5] tokenizers for each language, which makes it easy to scale up to more languages. The canary-1b-flash model has 32 encoder layers and 4 decoder layers, for a total of 883M parameters. For more details about the architecture, please refer to [9].
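
As a quick sanity check, the reported parameter count can be verified with plain PyTorch introspection once the checkpoint is loaded; this is a minimal sketch using the same loading call as in the Loading the Model section below:

```python
from nemo.collections.asr.models import EncDecMultiTaskModel

# Load the checkpoint (same call as in the Loading the Model section below).
canary_model = EncDecMultiTaskModel.from_pretrained('nvidia/canary-1b-flash')

# Sum parameter counts over all modules; expected to be roughly 883M.
total_params = sum(p.numel() for p in canary_model.parameters())
print(f"Total parameters: {total_params / 1e6:.0f}M")
```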

## NVIDIA NeMo

To train, fine-tune, or transcribe with canary-1b-flash, you will need to install [NVIDIA NeMo](https://github.com/NVIDIA/NeMo).

## How to Use this Model

The model is available for use in the NeMo toolkit [6], and can be used as a pre-trained checkpoint for inference or for fine-tuning on another dataset.

### Loading the Model

```python
from nemo.collections.asr.models import EncDecMultiTaskModel

# load model
canary_model = EncDecMultiTaskModel.from_pretrained('nvidia/canary-1b-flash')

# update decoding params
decode_cfg = canary_model.cfg.decoding
decode_cfg.beam.beam_size = 1
canary_model.change_decoding_strategy(decode_cfg)
```

## Input:
**Input Type(s):** Audio <br>
**Input Format(s):** .wav or .flac files <br>
**Input Parameter(s):** 1D <br>
**Other Properties Related to Input:** 16000 Hz Mono-channel Audio, Pre-Processing Not Needed <br>

Input to canary-1b-flash can be either a list of paths to audio files or a JSONL manifest file.

If the input is a list of paths, canary-1b-flash assumes that the audio is English and transcribes it; that is, the default behavior of canary-1b-flash is English ASR.
```python
output = canary_model.transcribe(
    ['path1.wav', 'path2.wav'],
    batch_size=16,  # batch size to run the inference with
    pnc=True,       # generate output with punctuation and capitalization
)

predicted_text_1 = output[0].text
```

canary-1b-flash can also generate word- and segment-level timestamps:
```python
output = canary_model.transcribe(
    ['filepath.wav'],
    timestamps='yes',  # generate output with timestamps
)

predicted_text = output[0].text
word_level_timestamps = output[0].timestamp['word']
segment_level_timestamps = output[0].timestamp['segment']
```

To use canary-1b-flash for transcribing other supported languages, performing speech-to-text translation, or producing word-level timestamps, specify the input as a JSONL manifest file, where each line in the file is a dictionary containing the following fields:

```yaml
# Example of a line in input_manifest.json
{
    "audio_filepath": "/path/to/audio.wav",  # path to the audio file
    "duration": 1000,  # duration of the audio, can be set to `None` if using NeMo main branch
    "taskname": "asr",  # use "s2t_translation" for speech-to-text translation with r1.23, or "ast" if using the NeMo main branch
    "source_lang": "en",  # language of the audio input, set `source_lang`==`target_lang` for ASR, choices=['en','de','es','fr']
    "target_lang": "en",  # language of the text output, choices=['en','de','es','fr']
    "pnc": "yes",  # whether to have PnC output, choices=['yes', 'no']
    "timestamp": "yes",  # whether to output word-level timestamps, choices=['yes', 'no']
}
```
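
For illustration, such a manifest can be generated with the standard json module; this is a minimal sketch, and the audio path, duration, and task settings are placeholders to replace with your own data:

```python
import json

# Hypothetical manifest entries; substitute real audio paths and durations.
entries = [
    {
        "audio_filepath": "/path/to/audio.wav",
        "duration": 1000,
        "taskname": "asr",
        "source_lang": "de",  # German ASR: source_lang == target_lang
        "target_lang": "de",
        "pnc": "yes",
        "timestamp": "no",
    },
]

# JSONL: one JSON object per line.
with open("input_manifest.json", "w") as f:
    for entry in entries:
        f.write(json.dumps(entry) + "\n")
```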

Then pass the manifest to `transcribe`:
```python
output = canary_model.transcribe(
    "<path to input manifest file>",
    batch_size=16,  # batch size to run the inference with
)
```

## Output:
**Output Type(s):** Text <br>
**Output Format:** Text output as a string (with timestamps, if requested) depending on the task chosen for decoding <br>
**Output Parameters:** 1-Dimensional text string <br>
**Other Properties Related to Output:** May Need Inverse Text Normalization; Does Not Handle Special Characters <br>

## Software Integration:
**Runtime Engine(s):**
* NeMo - 2.1.0 or higher <br>

**Supported Hardware Microarchitecture Compatibility:** <br>
* NVIDIA Ampere <br>
* NVIDIA Blackwell <br>
* NVIDIA Jetson <br>
* NVIDIA Hopper <br>
* NVIDIA Lovelace <br>
* NVIDIA Pascal <br>
* NVIDIA Turing <br>
* NVIDIA Volta <br>

**Supported Operating System(s):** <br>
* Linux <br>
* Linux 4 Tegra <br>
* Windows <br>

## Model Version(s):
canary-1b-flash <br>

# Training and Evaluation Datasets:

## Training Dataset:

The canary-1b-flash model is trained on a total of 85K hours of speech data: 31K hours of public data, 20K hours collected by [Suno](https://suno.ai/), and 34K hours of in-house data.
The datasets below include conversations, videos from the web, and audiobook recordings.

**Data Collection Method:**
* Human <br>

**Labeling Method:**
* Hybrid: Human, Automated <br>

The constituents of the public data are as follows.

#### English (25.5k hours)
- Librispeech - 960 hours
- Fisher Corpus
- Switchboard-1 Dataset
- WSJ-0 and WSJ-1
- National Speech Corpus (Part 1, Part 6)
- VCTK
- VoxPopuli (EN)
- Europarl-ASR (EN)
- Multilingual Librispeech (MLS EN) - 2,000 hour subset
- Mozilla Common Voice (v7.0)
- People's Speech - 12,000 hour subset
- Mozilla Common Voice (v11.0) - 1,474 hour subset

#### German (2.5k hours)
- Mozilla Common Voice (v12.0) - 800 hour subset
- Multilingual Librispeech (MLS DE) - 1,500 hour subset
- VoxPopuli (DE) - 200 hour subset

#### Spanish (1.4k hours)
- Mozilla Common Voice (v12.0) - 395 hour subset
- Multilingual Librispeech (MLS ES) - 780 hour subset
- VoxPopuli (ES) - 108 hour subset
- Fisher - 141 hour subset

#### French (1.8k hours)
- Mozilla Common Voice (v12.0) - 708 hour subset
- Multilingual Librispeech (MLS FR) - 926 hour subset
- VoxPopuli (FR) - 165 hour subset

## Evaluation Dataset:

**Data Collection Method:** <br>
* Human <br>

**Labeling Method:** <br>
* Human <br>

Automatic Speech Recognition:
* [HuggingFace OpenASR Leaderboard evaluation sets](https://huggingface.co/spaces/hf-audio/open_asr_leaderboard)
* [MLS](https://huggingface.co/datasets/facebook/multilingual_librispeech)
* [MCV](https://commonvoice.mozilla.org/en/datasets)

Automatic Speech Translation:
* [FLEURS](https://huggingface.co/datasets/google/fleurs)
* [COVOST-v2](https://github.com/facebookresearch/covost)
* [mExpresso](https://huggingface.co/facebook/seamless-expressive#mexpresso-multilingual-expresso)

Timestamp Prediction:
* [Librispeech](https://www.openslr.org/12)

Hallucination Robustness:
* [MUSAN](https://www.openslr.org/17/) 48-hour eval set

Noise Robustness:
* [Librispeech](https://www.openslr.org/12)

Model Fairness:
* [Casual Conversations Dataset](https://arxiv.org/pdf/2104.02821)

## Training

canary-1b-flash is trained using the NVIDIA NeMo toolkit [6] for a total of 200K steps with 2D bucketing [9] and optimal batch sizes set using OOMptimizer [7]. The model is trained on 128 NVIDIA A100 80GB GPUs.
The model can be trained using this [example script](https://github.com/NVIDIA/NeMo/blob/main/examples/asr/speech_multitask/speech_to_text_aed.py) and [base config](https://github.com/NVIDIA/NeMo/blob/main/examples/asr/conf/speech_multitask/fast-conformer_aed.yaml).

The tokenizers for these models were built using the text transcripts of the train set with this [script](https://github.com/NVIDIA/NeMo/blob/main/scripts/tokenizers/process_asr_text_tokenizer.py).

## Inference:
**Engine:** NVIDIA NeMo <br>
**Test Hardware:** <br>
* A6000 <br>
* A100 <br>
* V100 <br>

## Performance

In both ASR and AST experiments, predictions were generated using beam search with width 5 and length penalty 1.0.

### ASR Performance (w/o PnC)

The ASR performance is measured with word error rate (WER), and we process the ground-truth and predicted text with [whisper-normalizer](https://pypi.org/project/whisper-normalizer/).
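
For reference, the normalize-then-score recipe can be reproduced as in the sketch below; it assumes the whisper-normalizer and jiwer packages, with jiwer standing in for the leaderboard's WER implementation, and uses a hypothetical reference/hypothesis pair:

```python
from jiwer import wer
from whisper_normalizer.english import EnglishTextNormalizer

normalizer = EnglishTextNormalizer()

# Hypothetical reference and model output, for illustration only.
reference = "Mr. Brown bought one hundred shares."
hypothesis = "mister brown bought 100 shares"

# Normalize both sides before scoring, as done for the reported numbers.
print(f"WER: {wer(normalizer(reference), normalizer(hypothesis)):.2%}")
```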

WER on the [HuggingFace OpenASR leaderboard](https://huggingface.co/spaces/hf-audio/open_asr_leaderboard):

| **Version** | **Model** | **RTFx** | **AMI** | **GigaSpeech** | **LS Clean** | **LS Other** | **Earnings22** | **SPGISpeech** | **Tedlium** | **Voxpopuli** |
|:---------:|:-----------:|:------:|:------:|:------:|:------:|:------:|:------:|:------:|:------:|:------:|
| 2.2.0 | canary-1b-flash | 928.19 | 13.08 | 9.88 | 1.48 | 2.87 | 12.77 | 1.95 | 3.09 | 5.64 |

WER on the [MLS](https://huggingface.co/datasets/facebook/multilingual_librispeech) test set:

| **Version** | **Model** | **De** | **Es** | **Fr** |
|:---------:|:-----------:|:------:|:------:|:------:|
| 2.2.0 | canary-1b-flash | 4.36 | 2.69 | 4.47 |

WER on the [MCV-16.1](https://commonvoice.mozilla.org/en/datasets) test set:

| **Version** | **Model** | **En** | **De** | **Es** | **Fr** |
|:---------:|:-----------:|:------:|:------:|:------:|:------:|
| 2.2.0 | canary-1b-flash | 6.99 | 4.03 | 3.31 | 5.88 |

More details on evaluation can be found at the [HuggingFace ASR Leaderboard](https://huggingface.co/spaces/hf-audio/open_asr_leaderboard).

### AST Performance

We evaluate AST performance with the [BLEU score](https://lightning.ai/docs/torchmetrics/stable/text/sacre_bleu_score.html), using the datasets' native annotations with punctuation and capitalization.
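
The linked torchmetrics implementation can be used as in the minimal sketch below, with hypothetical sentences; note that torchmetrics reports BLEU on a 0-1 scale, so the value is multiplied by 100 to match the tables that follow:

```python
from torchmetrics.text import SacreBLEUScore

# Hypothetical prediction and reference, for illustration only.
preds = ["Das Wetter ist heute schön."]
target = [["Das Wetter ist heute sehr schön."]]  # one list of references per prediction

sacre_bleu = SacreBLEUScore()
score = sacre_bleu(preds, target)
print(f"BLEU: {100 * score.item():.2f}")  # scaled to match the tables below
```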

BLEU score on the [FLEURS](https://huggingface.co/datasets/google/fleurs) test set:

| **Version** | **Model** | **En->De** | **En->Es** | **En->Fr** | **De->En** | **Es->En** | **Fr->En** |
|:-----------:|:---------:|:----------:|:----------:|:----------:|:----------:|:----------:|:----------:|
| 2.2.0 | canary-1b-flash | 32.27 | 22.6 | 41.22 | 35.5 | 23.32 | 33.42 |

BLEU score on the [COVOST-v2](https://github.com/facebookresearch/covost) test set:

| **Version** | **Model** | **De->En** | **Es->En** | **Fr->En** |
|:-----------:|:---------:|:----------:|:----------:|:----------:|
| 2.2.0 | canary-1b-flash | 39.33 | 41.86 | 41.43 |

BLEU score on the [mExpresso](https://huggingface.co/facebook/seamless-expressive#mexpresso-multilingual-expresso) test set:

| **Version** | **Model** | **En->De** | **En->Es** | **En->Fr** |
|:-----------:|:---------:|:----------:|:----------:|:----------:|
| 2.2.0 | canary-1b-flash | 22.91 | 35.69 | 27.85 |

### Timestamp Prediction
F1-score on [Librispeech test sets](https://www.openslr.org/12) at a collar value of 200 ms:

| **Version** | **Model** | **test-clean** | **test-other** |
|:-----------:|:---------:|:----------:|:----------:|
| 2.2.0 | canary-1b-flash | 95.5 | 93.5 |
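
The scoring idea can be illustrated as follows; this is a minimal sketch assuming reference and hypothesis word timestamps have already been aligned one-to-one, which simplifies the matching protocol actually used for the table:

```python
# Hypothetical aligned (word, start, end) tuples, in seconds.
COLLAR = 0.2  # 200 ms collar

ref = [("hello", 0.10, 0.48), ("world", 0.55, 1.02)]
hyp = [("hello", 0.12, 0.50), ("world", 0.90, 1.40)]

# A hypothesis word is a hit if both boundaries fall within the collar.
hits = sum(
    1
    for (rw, rs, re), (hw, hs, he) in zip(ref, hyp)
    if rw == hw and abs(rs - hs) <= COLLAR and abs(re - he) <= COLLAR
)
precision = hits / len(hyp)
recall = hits / len(ref)
f1 = 2 * precision * recall / (precision + recall) if hits else 0.0
print(f"F1: {f1:.3f}")  # 0.500 for this toy example
```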

### Hallucination Robustness
Number of characters per minute on the [MUSAN](https://www.openslr.org/17) 48-hour eval set:

| **Version** | **Model** | **# of characters per minute** |
|:-----------:|:---------:|:----------:|
| 2.2.0 | canary-1b-flash | 60.92 |
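
This metric is simply the number of output characters emitted per minute of non-speech audio, so lower is better; a minimal sketch with hypothetical model outputs:

```python
# Hypothetical transcripts produced on noise-only clips, with clip durations in seconds.
transcripts = ["", "uh", ""]
durations = [30.0, 45.0, 60.0]

total_chars = sum(len(t) for t in transcripts)
total_minutes = sum(durations) / 60.0
print(f"{total_chars / total_minutes:.2f} characters per minute")
```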

### Noise Robustness
WER on [Librispeech Test Clean](https://www.openslr.org/12) at different SNR (signal-to-noise ratio) levels of additive white noise:

| **Version** | **Model** | **SNR 10** | **SNR 5** | **SNR 0** | **SNR -5** |
|:-----------:|:---------:|:----------:|:----------:|:----------:|:----------:|
| 2.2.0 | canary-1b-flash | 2.34 | 3.69 | 8.84 | 29.71 |
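
The corruption itself is straightforward to reproduce; the sketch below mixes white Gaussian noise into a waveform at a target SNR, though the exact mixing procedure used for the table is not specified here:

```python
import numpy as np

def add_white_noise(audio: np.ndarray, snr_db: float) -> np.ndarray:
    """Add white Gaussian noise to a waveform at a target SNR in dB."""
    signal_power = np.mean(audio ** 2)
    noise_power = signal_power / (10 ** (snr_db / 10))
    noise = np.random.normal(0.0, np.sqrt(noise_power), size=audio.shape)
    return audio + noise

# Hypothetical usage on a placeholder 1-second, 16 kHz mono waveform.
clean = np.random.randn(16000).astype(np.float32)
noisy = add_white_noise(clean, snr_db=5)
```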

## Model Fairness Evaluation

As outlined in the paper "Towards Measuring Fairness in AI: the Casual Conversations Dataset" [8], we assessed the canary-1b-flash model for fairness. The model was evaluated on the CasualConversations-v1 dataset, and the results are reported as follows:

### Gender Bias:

| Gender | Male | Female | N/A | Other |
| :--- | :--- | :--- | :--- | :--- |
| Num utterances | 19325 | 24532 | 926 | 33 |
| % WER | 14.66 | 12.44 | 17.17 | 27.56 |

### Age Bias:

| Age Group | (18-30) | (31-45) | (46-85) | (1-100) |
| :--- | :--- | :--- | :--- | :--- |
| Num utterances | 15956 | 14585 | 13349 | 43890 |
| % WER | 13.18 | 13.45 | 13.64 | 13.41 |

(Error rates for fairness evaluation are determined by normalizing both the reference and predicted text, similar to the methods used in the evaluations found at https://github.com/huggingface/open_asr_leaderboard.)

## License/Terms of Use:
canary-1b-flash is released under the CC-BY-4.0 license. By using this model, you are agreeing to the [terms and conditions](https://choosealicense.com/licenses/cc-by-4.0/) of the license. <br>

## References:
[1] [Less is More: Accurate Speech Recognition & Translation without Web-Scale Data](https://www.isca-archive.org/interspeech_2024/puvvada24_interspeech.pdf)

[2] [Fast Conformer with Linearly Scalable Attention for Efficient Speech Recognition](https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=10389701)

[3] [Attention Is All You Need](https://arxiv.org/abs/1706.03762)

[4] [Unified Model for Code-Switching Speech Recognition and Language Identification Based on Concatenated Tokenizer](https://aclanthology.org/2023.calcs-1.7.pdf)

[5] [Google Sentencepiece Tokenizer](https://github.com/google/sentencepiece)

[6] [NVIDIA NeMo Toolkit](https://github.com/NVIDIA/NeMo)

[7] [EMMeTT: Efficient Multimodal Machine Translation Training](https://arxiv.org/abs/2409.13523)

[8] [Towards Measuring Fairness in AI: the Casual Conversations Dataset](https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=9634168)

[9] [Training and Inference Efficiency of Encoder-Decoder Speech Models](https://arxiv.org/pdf/2503.05931)

## Ethical Considerations:
NVIDIA believes Trustworthy AI is a shared responsibility, and we have established policies and practices to enable development for a wide array of AI applications. When downloaded or used in accordance with our terms of service, developers should work with their internal model team to ensure this model meets requirements for the relevant industry and use case and addresses unforeseen product misuse.
Please report security vulnerabilities or NVIDIA AI concerns [here](https://www.nvidia.com/en-us/support/submit-security-vulnerability/).