gorkemgoknar committed on
Commit
96e034b
·
1 Parent(s): a8cb69e

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +5 -5
README.md CHANGED
@@ -28,7 +28,7 @@ model-index:
28
  ---
29
  # Wav2Vec2-Large-XLSR-53-Turkish
30
 
31
- Note: The Common Voice Turkish data is a voice-only dataset with no background noise.
32
  In this model, although the Word Error Rate on the test set is 50%, it is measured against Common Voice text.
33
 
34
  Please try it on your own speech and see that it converts pretty well.
@@ -120,7 +120,7 @@ model.to("cuda")
120
 
121
  #Note: Not ignoring "'" on this one
122
  #Note: Not ignoring "'" on this one
123
- chars_to_ignore_regex = """[\,\?\.\!\-\;\:\"\“\%\‘\”\�\#\>\<\_\’\[\]\{\}]"""
124
 
125
  #resampler = torchaudio.transforms.Resample(48_000, 16_000)
126
  #using custom load and transformer for audio -> see audio_resampler
@@ -155,13 +155,13 @@ def audio_resampler(batch, new_sample_rate = 16000):
155
  def remove_special_characters(batch):
156
 
157
  ##this one comes from subtitles if additional timestamps not processed -> 00:01:01 00:01:01,33
158
- batch["sentence"] = re.sub('\\\\\\\\\\\\\\\\b\\\\\\\\\\\\\\\\d{2}:\\\\\\\\\\\\\\\\d{2}:\\\\\\\\\\\\\\\\d{2}(,+\\\\\\\\\\\\\\\\d{2})?\\\\\\\\\\\\\\\\b', ' ', batch["sentence"])
159
 
160
  ##remove all caps in text [AÇIKLAMA] etc, do it before..
161
- batch["sentence"] = re.sub('\\\\\\\\\\\\\\\\[(\\\\\\\\\\\\\\\\b[A-Z]+\\\\\\\\\\\\\\\\])', '', batch["sentence"])
162
 
163
  ##replace three dots (that are inside string with single)
164
- batch["sentence"] = re.sub("([a-zA-Z]+)\\\\\\\\\\\\\\\\.\\\\\\\\\\\\\\\\.\\\\\\\\\\\\\\\\.", r"\\\\\\\\\\\\\\\\1.", batch["sentence"])
165
 
166
  #standard ignore list
167
  batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "
 
28
  ---
29
  # Wav2Vec2-Large-XLSR-53-Turkish
30
 
31
+ Note: The Common Voice Turkish data is a voice-only dataset with no background noise, and its speech is slower than everyday speech.
32
  In this model, although the Word Error Rate on the test set is 50%, it is measured against Common Voice text.
33
 
34
  Please try it on your own speech and see that it converts pretty well.
 
120
 
121
  #Note: Not ignoring "'" on this one
122
  #Note: Not ignoring "'" on this one
123
+ chars_to_ignore_regex = """[\\,\\?\\.\\!\\-\\;\\:\\"\\“\\%\\‘\\”\\�\\#\\>\\<\\_\\’\\[\\]\\{\\}]"""
124
 
125
  #resampler = torchaudio.transforms.Resample(48_000, 16_000)
126
  #using custom load and transformer for audio -> see audio_resampler
 
155
  def remove_special_characters(batch):
156
 
157
  ##this one comes from subtitles if additional timestamps not processed -> 00:01:01 00:01:01,33
158
+ batch["sentence"] = re.sub('\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\b\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\d{2}:\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\d{2}:\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\d{2}(,+\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\d{2})?\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\b', ' ', batch["sentence"])
159
 
160
  ##remove all caps in text [AÇIKLAMA] etc, do it before..
161
+ batch["sentence"] = re.sub('\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\[(\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\b[A-Z]+\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\])', '', batch["sentence"])
162
 
163
  ##replace three dots (that are inside string with single)
164
+ batch["sentence"] = re.sub("([a-zA-Z]+)\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\.\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\.\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\.", r"\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\1.", batch["sentence"])
165
 
166
  #standard ignore list
167
  batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "