---
language:
- tr
datasets:
- common_voice
metrics:
- wer
tags:
- audio
- automatic-speech-recognition
- speech
- xlsr-fine-tuning-week
license: apache-2.0
model-index:
- name: XLSR Wav2Vec2 Large Turkish by Gorkem Goknar
results:
- task:
name: Speech Recognition
type: automatic-speech-recognition
dataset:
name: Common Voice tr
type: common_voice
args: tr
metrics:
- name: Test WER
type: wer
value: TBD
---
# Wav2Vec2-Large-XLSR-53-Turkish
Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on Turkish using the [Common Voice](https://huggingface.co/datasets/common_voice) dataset.
When using this model, make sure that your speech input is sampled at 16 kHz.
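If your audio is at a different rate, a minimal resampling sketch with torchaudio looks like this (`my_audio.wav` is a hypothetical placeholder path, not part of this repository):
```python
import torchaudio

# Load an arbitrary audio file; "my_audio.wav" is a placeholder example path.
speech_array, sampling_rate = torchaudio.load("my_audio.wav")
if sampling_rate != 16_000:
    # Resample to the 16 kHz the model expects.
    speech_array = torchaudio.transforms.Resample(sampling_rate, 16_000)(speech_array)
```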
## Usage
The model can be used directly (without a language model) as follows:
```python
import torch
import array
import numpy as np
from datasets import load_dataset
from pydub import AudioSegment
from pydub.utils import get_array_type
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

test_dataset = load_dataset("common_voice", "tr", split="test[:2%]")

processor = Wav2Vec2Processor.from_pretrained("gorkemgoknar/wav2vec2-large-xlsr-53-turkish")
model = Wav2Vec2ForCTC.from_pretrained("gorkemgoknar/wav2vec2-large-xlsr-53-turkish")

def audio_resampler(batch, new_sample_rate=16000):
    # torchaudio and librosa are troublesome with mp3 on Windows;
    # pydub's AudioSegment decodes via ffmpeg (which must be installed).
    sound = AudioSegment.from_file(file=batch["path"])
    sound = sound.set_frame_rate(new_sample_rate)
    left = sound.split_to_mono()[0]
    bit_depth = left.sample_width * 8
    array_type = get_array_type(bit_depth)
    numeric_array = np.array(array.array(array_type, left._data))
    batch["speech"] = numeric_array
    batch["sampling_rate"] = new_sample_rate
    batch["target_text"] = batch["sentence"]
    return batch

# Preprocessing: read the audio files as arrays, resampled to 16 kHz.
test_dataset = test_dataset.map(audio_resampler)

inputs = processor(test_dataset["speech"][:2], sampling_rate=16_000, return_tensors="pt", padding=True)

with torch.no_grad():
    logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits

predicted_ids = torch.argmax(logits, dim=-1)
print("Prediction:", processor.batch_decode(predicted_ids))
print("Reference:", test_dataset["sentence"][:2])
```
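For a single local file rather than a dataset batch, the same helper can be reused. This is a minimal sketch, and `my_audio.mp3` is a hypothetical path:
```python
# Hypothetical single-file inference reusing audio_resampler from above;
# "my_audio.mp3" is a placeholder, and "sentence" is only present because
# the helper copies it into "target_text".
batch = audio_resampler({"path": "my_audio.mp3", "sentence": ""})
inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
with torch.no_grad():
    logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
print("Prediction:", processor.batch_decode(torch.argmax(logits, dim=-1)))
```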
## Evaluation
The model can be evaluated as follows on the Turkish test data of Common Voice.
```python
import torch
import torchaudio
import re
from datasets import load_dataset, load_metric
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

test_dataset = load_dataset("common_voice", "tr", split="test")
wer = load_metric("wer")

processor = Wav2Vec2Processor.from_pretrained("gorkemgoknar/wav2vec2-large-xlsr-53-turkish")
model = Wav2Vec2ForCTC.from_pretrained("gorkemgoknar/wav2vec2-large-xlsr-53-turkish")
model.to("cuda")

# Note: the apostrophe "'" is NOT ignored here.
chars_to_ignore_regex = '[\\,\\?\\.\\!\\-\\;\\:\\"\\“\\%\\‘\\”\\�\\#\\>\\<\\_\\’\\[\\]\\{\\}]'
resampler = torchaudio.transforms.Resample(48_000, 16_000)

# Preprocessing: normalize the sentences and read the audio files as arrays.
def speech_file_to_array_fn(batch):
    batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
    speech_array, sampling_rate = torchaudio.load(batch["path"])
    batch["speech"] = resampler(speech_array).squeeze().numpy()
    return batch

test_dataset = test_dataset.map(speech_file_to_array_fn)

# Run batched inference on the GPU and collect the predicted strings.
def evaluate(batch):
    inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
    pred_ids = torch.argmax(logits, dim=-1)
    batch["pred_strings"] = processor.batch_decode(pred_ids)
    return batch

result = test_dataset.map(evaluate, batched=True, batch_size=8)

print("WER: {:.2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
```
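If `datasets.load_metric` is unavailable in your environment, the same score can be computed with the `jiwer` package. This is an assumption about your setup, not part of the original recipe:
```python
# Equivalent WER computation with jiwer (assumed installed via: pip install jiwer).
from jiwer import wer as jiwer_wer

print("WER: {:.2f}".format(100 * jiwer_wer(result["sentence"], result["pred_strings"])))
```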
**Test Result**: TBD %
## Training
The Common Voice `train` and `validation` datasets were used for training. Five Turkish movies with subtitles were also used as additional training data; a sketch of loading the Common Voice splits follows below.
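A minimal sketch of loading those Common Voice splits (the movie-subtitle data is not publicly available and is omitted here):
```python
from datasets import load_dataset

# Concatenate the train and validation splits, as used for fine-tuning.
train_dataset = load_dataset("common_voice", "tr", split="train+validation")
```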