gorkemgoknar
/

wav2vec2-large-xlsr-53-turkish

@@ -28,7 +28,7 @@ model-index:
 ---
 # Wav2Vec2-Large-XLSR-53-Turkish
-Note: The Common Voice Turkish data is a voice-only dataset with no background noise.
 Although this model's word error rate (WER) on the test set is 50%, that figure is measured against the Common Voice text.
 Please try it with your own speech and see that it converts pretty well.
@@ -120,7 +120,7 @@ model.to("cuda")
 #Note: Not ignoring "'"  on this one
 #Note: Not ignoring "'"  on this one
-chars_to_ignore_regex = """[\,\?\.\!\-\;\:\"\“\%\‘\”\�\#\>\<\_\’\[\]\{\}]"""
 #resampler = torchaudio.transforms.Resample(48_000, 16_000)
 #using custom load and transformer for audio  -> see audio_resampler
@@ -155,13 +155,13 @@ def audio_resampler(batch, new_sample_rate = 16000):
 def remove_special_characters(batch):
     ##this one comes from subtitles if additional timestamps not processed  -> 00:01:01   00:01:01,33
-    batch["sentence"] = re.sub('\\\\\\\\\\\\\\\\b\\\\\\\\\\\\\\\\d{2}:\\\\\\\\\\\\\\\\d{2}:\\\\\\\\\\\\\\\\d{2}(,+\\\\\\\\\\\\\\\\d{2})?\\\\\\\\\\\\\\\\b', ' ', batch["sentence"])
     ##remove all caps in text [AÇIKLAMA] etc, do it before..
-    batch["sentence"] = re.sub('\\\\\\\\\\\\\\\\[(\\\\\\\\\\\\\\\\b[A-Z]+\\\\\\\\\\\\\\\\])', '', batch["sentence"])
     ##replace three dots (that are inside string with single)
-    batch["sentence"] = re.sub("([a-zA-Z]+)\\\\\\\\\\\\\\\\.\\\\\\\\\\\\\\\\.\\\\\\\\\\\\\\\\.", r"\\\\\\\\\\\\\\\\1.", batch["sentence"])
     #standard ignore list
     batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "

 ---
 # Wav2Vec2-Large-XLSR-53-Turkish
+Note: The Common Voice Turkish data is a voice-only dataset with no background noise, spoken more slowly than everyday speech.
 Although this model's word error rate (WER) on the test set is 50%, that figure is measured against the Common Voice text.
 Please try it with your own speech and see that it converts pretty well.
 #Note: Not ignoring "'"  on this one
 #Note: Not ignoring "'"  on this one
+chars_to_ignore_regex = """[\\,\\?\\.\\!\\-\\;\\:\\"\\“\\%\\‘\\”\\�\\#\\>\\<\\_\\’\\[\\]\\{\\}]"""
 #resampler = torchaudio.transforms.Resample(48_000, 16_000)
 #using custom load and transformer for audio  -> see audio_resampler
 def remove_special_characters(batch):
     ##this one comes from subtitles if additional timestamps not processed  -> 00:01:01   00:01:01,33
+    batch["sentence"] = re.sub('\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\b\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\d{2}:\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\d{2}:\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\d{2}(,+\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\d{2})?\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\b', ' ', batch["sentence"])
     ##remove all caps in text [AÇIKLAMA] etc, do it before..
+    batch["sentence"] = re.sub('\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\[(\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\b[A-Z]+\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\])', '', batch["sentence"])
     ##replace three dots (that are inside string with single)
+    batch["sentence"] = re.sub("([a-zA-Z]+)\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\.\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\.\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\.", r"\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\1.", batch["sentence"])
     #standard ignore list
     batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "