gorkemgoknar
/

wav2vec2-large-xlsr-53-turkish

@@ -99,34 +99,105 @@ import torchaudio
 from datasets import load_dataset, load_metric
 from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
 import re
 test_dataset = load_dataset("common_voice", "tr", split="test")
 wer = load_metric("wer")
 processor = Wav2Vec2Processor.from_pretrained("gorkemgoknar/wav2vec2-large-xlsr-53-turkish")
 model = Wav2Vec2ForCTC.from_pretrained("gorkemgoknar/wav2vec2-large-xlsr-53-turkish")
 model.to("cuda")
-# Note: Not ignoring "'"  on this one
-chars_to_ignore_regex = """[\,\?\.\!\-\;\:\"\“\%\‘\”\�\#\>\<\_\’\[\]\{\}]"""
-resampler = torchaudio.transforms.Resample(48_000, 16_000)
 # Preprocessing the datasets.
 # We need to read the audio files as arrays.
 def speech_file_to_array_fn(batch):
     """Clean the transcript and decode the audio of one dataset example.

     Removes every character matched by ``chars_to_ignore_regex`` from
     ``batch["sentence"]`` and lowercases the result, loads the file at
     ``batch["path"]`` with ``torchaudio.load``, passes it through
     ``resampler`` and stores the squeezed NumPy array in
     ``batch["speech"]``. Returns the same ``batch``.
     """
     batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
-    speech_array, sampling_rate = torchaudio.load(batch["path"])
-    batch["speech"] = resampler(speech_array).squeeze().numpy()
     return batch
 test_dataset = test_dataset.map(speech_file_to_array_fn)
 # Running the model.
 # We transcribe the preprocessed examples and keep the predicted strings.
 def evaluate(batch):
     """Transcribe a batch of examples and store the text in ``pred_strings``.

     Used with ``test_dataset.map(..., batched=True)``. Reads
     ``batch["speech"]``, writes ``batch["pred_strings"]`` and returns the
     same ``batch``.
     """
     # The processor pads the examples of the batch and returns PyTorch tensors.
     inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
     # Inference only, so gradient tracking is switched off.
     with torch.no_grad():
         logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
     # Take the highest-scoring id along the last dimension, then decode to text.
     pred_ids = torch.argmax(logits, dim=-1)
     batch["pred_strings"] = processor.batch_decode(pred_ids)
     return batch
-result = test_dataset.map(evaluate, batched=True, batch_size=8)
 print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
 ```
 **Test Result**: TBD %
 ## Training

 from datasets import load_dataset, load_metric
 from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
 import re
+import torch
+import pydub
+from pydub.utils import mediainfo
+import array
+from pydub import AudioSegment
+from pydub.utils import get_array_type
+import numpy as np
 test_dataset = load_dataset("common_voice", "tr", split="test")
 wer = load_metric("wer")
 processor = Wav2Vec2Processor.from_pretrained("gorkemgoknar/wav2vec2-large-xlsr-53-turkish")
 model = Wav2Vec2ForCTC.from_pretrained("gorkemgoknar/wav2vec2-large-xlsr-53-turkish")
 model.to("cuda")
+# Note: the straight apostrophe (') is deliberately NOT in the ignore list below,
+# so it is kept in the reference sentences.
+chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�\#\>\<\_\’\[\]\{\}]'
+#resampler = torchaudio.transforms.Resample(48_000, 16_000)
+# Audio is loaded and resampled by a custom function instead -> see audio_resampler
+new_sample_rate = 16000
+import torchaudio
+import torch
+import pydub
+import array
+import numpy as np
+def audio_resampler(batch, new_sample_rate = 16000):
+    #not working without complex library compilation in windows for mp3
+    #speech_array, sampling_rate = torchaudio.load(batch["path"])
+    #speech_array, sampling_rate = librosa.load(batch["path"])
+    #sampling_rate =  pydub.utils.info['sample_rate']  ##gets current samplerate
+    sound = pydub.AudioSegment.from_file(file=batch["path"])
+    sampling_rate = new_sample_rate
+    sound = sound.set_frame_rate(new_sample_rate)
+    left = sound.split_to_mono()[0]
+    bit_depth = left.sample_width * 8
+    array_type = pydub.utils.get_array_type(bit_depth)
+    numeric_array = np.array(array.array(array_type, left._data) )
+    speech_array = torch.FloatTensor(numeric_array)
+    batch["speech"] = numeric_array
+    batch["sampling_rate"] = sampling_rate
+    #batch["target_text"] = batch["sentence"]
+    return batch
+def remove_special_characters(batch):
+    ##this one comes from subtitles if additional timestamps not processed  -> 00:01:01   00:01:01,33
+    batch["sentence"] = re.sub('\b\d{2}:\d{2}:\d{2}(,+\d{2})?\b', ' ', batch["sentence"])
+    ##remove all caps in text [AÇIKLAMA] etc, do it before..
+    batch["sentence"] = re.sub('\[(\b[A-Z]+\])', '', batch["sentence"])
+    ##replace three dots (that are inside string with single)
+    batch["sentence"] = re.sub("([a-zA-Z]+)\.\.\.", r"\1.", batch["sentence"])
+    #standart ignore list
+    batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "
+    return batch
 # Preprocessing the datasets.
 # We need to read the audio files as arrays.
 def speech_file_to_array_fn(batch):
     """Clean the transcript and decode the audio of one dataset example.

     Removes every character matched by ``chars_to_ignore_regex`` from
     ``batch["sentence"]`` and lowercases the result, then hands the example
     to ``audio_resampler`` with the module-level ``new_sample_rate`` and
     returns the batch that it gives back.
     """
     batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
+    # Previously: speech_array, sampling_rate = torchaudio.load(batch["path"])
+    # Loading and conversion are now done in audio_resampler, which takes and returns the batch.
+    batch = audio_resampler(batch, new_sample_rate = new_sample_rate)
     return batch
 test_dataset = test_dataset.map(speech_file_to_array_fn)
 # Running the model.
 # We transcribe the preprocessed examples and keep the predicted strings.
 def evaluate(batch):
     """Transcribe a batch of examples and store the text in ``pred_strings``.

     Used with ``test_dataset.map(..., batched=True)``. Reads
     ``batch["speech"]``, writes ``batch["pred_strings"]`` and returns the
     same ``batch``.
     """
     # The processor pads the examples of the batch and returns PyTorch tensors.
     inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
     # Inference only, so gradient tracking is switched off.
     with torch.no_grad():
         logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
     # Take the highest-scoring id along the last dimension, then decode to text.
     pred_ids = torch.argmax(logits, dim=-1)
     batch["pred_strings"] = processor.batch_decode(pred_ids)
     return batch
+print("EVALUATING:")
+# With 8 GB of GPU memory, batch_size=2 works best on Windows; batch_size=4 may fit on Linux only.
+result = test_dataset.map(evaluate, batched=True, batch_size=2)
 print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
 ```
 **Test Result**: TBD %
 ## Training