kotoba-tech
/

kotoba-whisper-v1.0

Automatic Speech Recognition

hf-asr-leaderboard

Model card Files Files and versions

asahi417 committed on May 6, 2024

Commit

f65a4d4

·

verified ·

1 Parent(s): b20891c

Update README.md

Files changed (1) hide show

README.md +4 -2

README.md CHANGED Viewed

@@ -300,6 +300,7 @@ import torch
 from transformers import pipeline
 from datasets import load_dataset
 from evaluate import load
 # model config
 model_id = "kotoba-tech/kotoba-whisper-v1.0"
@@ -307,6 +308,7 @@ torch_dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 model_kwargs = {"attn_implementation": "sdpa"} if torch.cuda.is_available() else {}
 generate_kwargs = {"language": "japanese", "task": "transcribe"}
 # data config
 dataset_name = "japanese-asr/ja_asr.reazonspeech_test"
@@ -326,8 +328,8 @@ pipe = pipeline(
 # load the dataset and sample the audio with 16kHz
 dataset = load_dataset(dataset_name, split="test")
 transcriptions = pipe(dataset['audio'])
-transcriptions = [i['text'].replace(" ", "") for i in transcriptions]
-references = [i.replace(" ", "") for i in dataset['transcription']]
 # compute the CER metric
 cer_metric = load("cer")

 from transformers import pipeline
 from datasets import load_dataset
 from evaluate import load
+from transformers.models.whisper.english_normalizer import BasicTextNormalizer
 # model config
 model_id = "kotoba-tech/kotoba-whisper-v1.0"
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 model_kwargs = {"attn_implementation": "sdpa"} if torch.cuda.is_available() else {}
 generate_kwargs = {"language": "japanese", "task": "transcribe"}
+normalizer = BasicTextNormalizer()
 # data config
 dataset_name = "japanese-asr/ja_asr.reazonspeech_test"
 # load the dataset and sample the audio with 16kHz
 dataset = load_dataset(dataset_name, split="test")
 transcriptions = pipe(dataset['audio'])
+transcriptions = [normalizer(i['text']).replace(" ", "") for i in transcriptions]
+references = [normalizer(i).replace(" ", "") for i in dataset['transcription']]
 # compute the CER metric
 cer_metric = load("cer")