Commit
·
0dd7f50
1
Parent(s):
96e034b
Update README.md
Browse files
README.md
CHANGED
@@ -120,7 +120,8 @@ model.to("cuda")
|
|
120 |
|
121 |
#Note: Not ignoring "'" on this one
|
122 |
#Note: Not ignoring "'" on this one
|
123 |
-
chars_to_ignore_regex =
|
|
|
124 |
|
125 |
#resampler = torchaudio.transforms.Resample(48_000, 16_000)
|
126 |
#using custom load and transformer for audio -> see audio_resampler
|
@@ -151,23 +152,22 @@ def audio_resampler(batch, new_sample_rate = 16000):
|
|
151 |
|
152 |
return batch
|
153 |
|
154 |
-
|
155 |
def remove_special_characters(batch):
|
156 |
|
157 |
##this one comes from subtitles if additional timestamps not processed -> 00:01:01 00:01:01,33
|
158 |
-
batch["sentence"] = re.sub('
|
159 |
|
160 |
##remove all caps in text [AÇIKLAMA] etc, do it before..
|
161 |
-
batch["sentence"] = re.sub('
|
162 |
|
163 |
##replace three dots (that are inside string with single)
|
164 |
-
batch["sentence"] = re.sub("([a-zA-Z]+)
|
165 |
|
166 |
#standart ignore list
|
167 |
batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "
|
168 |
-
|
169 |
|
170 |
return batch
|
|
|
171 |
|
172 |
# Preprocessing the datasets.
|
173 |
# We need to read the audio files as arrays
|
|
|
120 |
|
121 |
#Note: Not ignoring "'" on this one
|
122 |
#Note: Not ignoring "'" on this one
|
123 |
+
# Character class of punctuation/symbols stripped from transcripts.
# The apostrophe (') is intentionally absent so it survives cleaning.
# Raw string: the backslashes must reach the regex engine untouched, and a
# non-raw literal here triggers invalid-escape-sequence warnings.
chars_to_ignore_regex = r'[\,\?\.\!\-\;\:\"\“\%\‘\”\�\#\>\<\_\’\[\]\{\}]'
|
124 |
+
|
125 |
|
126 |
#resampler = torchaudio.transforms.Resample(48_000, 16_000)
|
127 |
#using custom load and transformer for audio -> see audio_resampler
|
|
|
152 |
|
153 |
return batch
|
154 |
|
|
|
155 |
def remove_special_characters(batch):
    """Clean ``batch["sentence"]`` so it can be used as a transcript.

    Steps, in order:
      1. replace subtitle timestamps with a single space,
      2. drop bracketed all-uppercase annotations,
      3. collapse an ellipsis that directly follows an ASCII word into one dot,
      4. strip every character matched by the module-level
         ``chars_to_ignore_regex``, lowercase the text and append one space.

    Args:
        batch: Mapping holding the text under the ``"sentence"`` key.

    Returns:
        The same ``batch`` object with ``"sentence"`` rewritten.
    """
    sentence = batch["sentence"]

    ##this one comes from subtitles if additional timestamps not processed -> 00:01:01 00:01:01,33
    # Raw string is required: in a normal literal "\b" is a backspace
    # character, not a regex word boundary, so nothing would ever match.
    sentence = re.sub(r'\b\d{2}:\d{2}:\d{2}(,+\d{2})?\b', ' ', sentence)

    ##remove all caps in text [AÇIKLAMA] etc, do it before lowercasing
    # [A-Z] would miss non-ASCII capitals such as Ç, so match any bracketed
    # run of letters and keep it unless the whole run is uppercase.
    sentence = re.sub(
        r'\[([^\W\d_]+)\]',
        lambda m: '' if m.group(1).isupper() else m.group(0),
        sentence,
    )

    ##replace three dots (that are inside string with single)
    sentence = re.sub(r"([a-zA-Z]+)\.\.\.", r"\1.", sentence)

    #standard ignore list
    batch["sentence"] = re.sub(chars_to_ignore_regex, '', sentence).lower() + " "

    return batch
|
170 |
+
|
171 |
|
172 |
# Preprocessing the datasets.
|
173 |
# We need to read the audio files as arrays
|