Vineel Pratap committed · Commit f138a14 · Parent(s): a7567f9

resampling fix

Files changed:
- app.py +2 -1
- requirements.txt +2 -2
- utils/lm.py +9 -11
- utils/norm_config.py +8 -9
- utils/text_norm.py +15 -6
- zeroshot.py +9 -8
app.py CHANGED

@@ -53,7 +53,8 @@ with gr.Blocks(css="style.css") as demo:
             )
         with gr.Column():
             autolm = gr.Checkbox(
-                label="Automatically create Unigram LM from text data",
+                label="Automatically create Unigram LM from text data",
+                value=True,
             )
     btn = gr.Button("Submit", elem_id="submit")
 
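For context, a minimal sketch of what the added value=True does in Gradio: gr.Checkbox defaults to False, so the box previously started unchecked and now starts checked, opting into the auto-LM path unless the user unticks it. The echo textbox and handler below are illustrative, not the Space's actual layout.

import gradio as gr

with gr.Blocks() as demo:
    # value=True pre-checks the box; before this commit it started unchecked
    # (Gradio's default value is False).
    autolm = gr.Checkbox(
        label="Automatically create Unigram LM from text data",
        value=True,
    )
    echo = gr.Textbox(label="Checkbox state")       # illustrative output
    autolm.change(lambda v: str(v), autolm, echo)   # illustrative handler

if __name__ == "__main__":
    demo.launch()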
requirements.txt CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:41b8b278a5c4d2fc182c7893bcc683ad261ab0612cea1da58aaed1b358fd9649
+size 164
utils/lm.py CHANGED

@@ -1,14 +1,15 @@
-# Creates unigram LM following KenLM
-import math
+# Creates unigram LM following KenLM
+import math
 import shutil, tempfile
 
+
 def calculate_log_probabilities(word_counts, num_sentences, n_smoothing=0.01):
     """
     Calculate log probabilities for each word in the corpus,
     including a special <unk> token for unknown words.
     """
-    total_words = sum(word_counts.values())
-    total_words += 2 * num_sentences
+    total_words = sum(word_counts.values())
+    total_words += 2 * num_sentences  # add counts for <s> and </s>
     # Adjust total for <unk>
     total_words_with_unk = total_words + 1  # Adding 1 for <unk>
     total_words_with_unk = total_words_with_unk + total_words_with_unk * n_smoothing
@@ -25,6 +26,7 @@ def calculate_log_probabilities(word_counts, num_sentences, n_smoothing=0.01):
     # Convert to log probabilities
     return {word: math.log10(prob) for word, prob in probabilities.items()}
 
+
 def maybe_generate_pseudo_bigram_arpa(arpa_fpath):
     with open(arpa_fpath, "r") as file:
         lines = file.readlines()
@@ -46,6 +48,7 @@ def maybe_generate_pseudo_bigram_arpa(arpa_fpath):
 
             file.write(line)
 
+
 def save_log_probabilities(log_probabilities, file_path):
     with open(file_path, "w") as file:
         file.write(f"\data\\")
@@ -59,13 +62,8 @@ def save_log_probabilities(log_probabilities, file_path):
         file.write(f"{log_prob}\t{word}\n")
     file.write(f"\n")
     file.write(f"\end\\")
-
+
+
 def create_unigram_lm(word_counts, num_sentences, file_path, n_smoothing=0.01):
     log_probs = calculate_log_probabilities(word_counts, num_sentences, n_smoothing)
     save_log_probabilities(log_probs, file_path)
-
-
-
-
-
-
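To make the counting arithmetic in calculate_log_probabilities concrete, here is a small worked sketch on a toy corpus. Only the totals mirror the diff; the final per-word formula (add-k style (count + n_smoothing) / total, and the <unk> entry) is an assumption, since the lines that build the probabilities dict are not part of this diff.

from collections import Counter
import math

# Toy corpus: two "sentences", so num_sentences = 2.
sentences = ["the cat sat", "the dog sat"]
word_counts = Counter(w for s in sentences for w in s.split())
num_sentences = len(sentences)
n_smoothing = 0.01

# Totals as in the diff: count <s> and </s> once per sentence, reserve one
# slot for <unk>, then inflate the denominator by the smoothing mass.
total_words = sum(word_counts.values()) + 2 * num_sentences   # 6 + 4 = 10
total_words_with_unk = total_words + 1                        # 11
total_words_with_unk += total_words_with_unk * n_smoothing    # 11.11

# Assumed add-k completion (the probabilities dict itself is not shown in the diff):
log_probs = {
    w: math.log10((c + n_smoothing) / total_words_with_unk)
    for w, c in word_counts.items()
}
log_probs["<unk>"] = math.log10(n_smoothing / total_words_with_unk)  # assumed
print(f"{log_probs['the']:.2f}")  # "the" has count 2 -> log10(2.01 / 11.11) ≈ -0.74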
utils/norm_config.py CHANGED

@@ -42,7 +42,7 @@ inverted_question_mark = r"\u00BF"
 
 
 # Hindi
-hindi_danda =
+hindi_danda = "\u0964"
 
 # Egyptian Arabic
 # arabic_percent = r"\u066A"
@@ -175,7 +175,7 @@ nominal_digit_shapes = r"\u206f"
 with open(f"{os.path.dirname(__file__)}/punctuations.lst", "r") as punc_f:
     punc_list = punc_f.readlines()
 
-punct_pattern = r""
+punct_pattern = r""
 for punc in punc_list:
     # the first character in the tab separated line is the punc to be removed
     punct_pattern += re.escape(punc.split("\t")[0])
@@ -213,7 +213,6 @@ shared_punc_list = (
     + arabic_question_mark
     + chinese_punc
     + punct_pattern
-
 )
 
 shared_mappping = {
@@ -242,11 +241,11 @@ norm_config = {
         "mapping": shared_mappping,
         "digit_set": shared_digits,
         "unicode_norm": "NFKC",
-        "rm_diacritics"
+        "rm_diacritics": False,
     }
 }
 
-
+# =============== Mongolian ===============#
 
 norm_config["mon"] = norm_config["*"].copy()
 # add soft hyphen to punc list to match with fleurs
@@ -254,23 +253,23 @@ norm_config["mon"]["del_set"] += r"\u00AD"
 
 norm_config["khk"] = norm_config["mon"].copy()
 
-
+# =============== Hebrew ===============#
 
 norm_config["heb"] = norm_config["*"].copy()
 # add "HEBREW POINT" symbols to match with fleurs
 norm_config["heb"]["del_set"] += r"\u05B0-\u05BF\u05C0-\u05CF"
 
-
+# =============== Thai ===============#
 
 norm_config["tha"] = norm_config["*"].copy()
 # add "Zero width joiner" symbols to match with fleurs
 norm_config["tha"]["punc_set"] += r"\u200D"
 
-
+# =============== Arabic ===============#
 norm_config["ara"] = norm_config["*"].copy()
 norm_config["ara"]["mapping"]["ٱ"] = "ا"
 norm_config["arb"] = norm_config["ara"].copy()
 
-
+# =============== Javanese ===============#
 norm_config["jav"] = norm_config["*"].copy()
 norm_config["jav"]["rm_diacritics"] = True
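For orientation, a minimal sketch of the pattern the new section headers annotate: norm_config["*"] holds the shared defaults and each ISO language code copies it and overrides individual fields. The toy punc_set/del_set values below are illustrative stand-ins, not the real character classes.

# Illustrative defaults; the real "*" entry carries much larger character sets.
norm_config = {
    "*": {
        "lower_case": True,
        "punc_set": r"\.\,",      # illustrative subset
        "del_set": r"",           # illustrative
        "unicode_norm": "NFKC",
        "rm_diacritics": False,   # the key this commit adds to the defaults
    }
}

# =============== Hebrew ===============#
norm_config["heb"] = norm_config["*"].copy()
norm_config["heb"]["del_set"] += r"\u05B0-\u05BF\u05C0-\u05CF"  # HEBREW POINT range

# =============== Javanese ===============#
norm_config["jav"] = norm_config["*"].copy()
norm_config["jav"]["rm_diacritics"] = True

# Overriding a copied string or flag does not touch the shared defaults.
print(norm_config["jav"]["rm_diacritics"], norm_config["*"]["rm_diacritics"])  # True False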
utils/text_norm.py CHANGED

@@ -5,7 +5,9 @@ import unicodedata
 from utils.norm_config import norm_config
 
 
-def text_normalize(
+def text_normalize(
+    text, iso_code, lower_case=True, remove_numbers=True, remove_brackets=False
+):
 
     """Given a text, normalize it by changing to lower case, removing punctuations, removing words that only contain digits and removing extra spaces
 
@@ -15,17 +17,23 @@ def text_normalize(text, iso_code, lower_case=True, remove_numbers=True, remove_
         remove_numbers : Boolean flag to specify if words containing only digits should be removed
 
     Returns:
-        normalized_text : the string after all normalization
+        normalized_text : the string after all normalization
 
     """
 
     config = norm_config.get(iso_code, norm_config["*"])
 
-    for field in [
+    for field in [
+        "lower_case",
+        "punc_set",
+        "del_set",
+        "mapping",
+        "digit_set",
+        "unicode_norm",
+    ]:
         if field not in config:
             config[field] = norm_config["*"][field]
 
-
     text = unicodedata.normalize(config["unicode_norm"], text)
 
     # Convert to lower case
@@ -34,7 +42,7 @@ def text_normalize(text, iso_code, lower_case=True, remove_numbers=True, remove_
         text = text.lower()
 
     # brackets
-
+
     # always text inside brackets with numbers in them. Usually corresponds to "(Sam 23:17)"
     text = re.sub(r"\([^\)]*\d[^\)]*\)", " ", text)
     if remove_brackets:
@@ -84,9 +92,10 @@ def text_normalize(text, iso_code, lower_case=True, remove_numbers=True, remove_
 
     if config["rm_diacritics"]:
         from unidecode import unidecode
+
         normalized_text = unidecode(normalized_text)
 
     # Remove extra spaces
     normalized_text = re.sub(r"\s+", " ", normalized_text).strip()
 
-    return normalized_text
+    return normalized_text
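A usage sketch of the reformatted signature, assuming the module is importable as utils.text_norm (as in the diff). The sample sentence and the "eng" ISO code are illustrative; unknown codes fall back to the "*" config via norm_config.get, and the exact output depends on that language's config.

from utils.text_norm import text_normalize

# Lower-cases, strips punctuation, drops digit-only words, removes bracketed
# spans containing digits such as "(Sam 23:17)", and squeezes whitespace.
raw = "Hello, World!  (Sam 23:17) 1234 tokens"
print(text_normalize(raw, iso_code="eng", remove_numbers=True))
# expected shape: "hello world tokens"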
zeroshot.py CHANGED

@@ -34,7 +34,7 @@ class MY_LOG:
     def __init__(self):
         self.text = "[START]"
 
-    def add(self, new_log, new_line=
+    def add(self, new_log, new_line=True):
         self.text = self.text + ("\n" if new_line else " ") + new_log
         self.text = self.text.strip()
         return self.text
@@ -127,7 +127,9 @@ def process(
         audio_samples = (audio_samples / 32768.0).astype(float)
 
         if sr != ASR_SAMPLING_RATE:
-            audio_samples = librosa.resample(
+            audio_samples = librosa.resample(
+                audio_samples, orig_sr=sr, target_sr=ASR_SAMPLING_RATE
+            )
     else:
         # file upload
         assert isinstance(audio_data, str)
@@ -179,15 +181,14 @@ def process(
         # print(k, v)
     yield transcription, logs.add(f"Leixcon size: {len(lexicon)}")
 
-    # Input could be sentences OR list of words. Check if atleast one word has a count > 1 to diffentiate
-    tmp_file = tempfile.NamedTemporaryFile()
+    # Input could be sentences OR list of words. Check if atleast one word has a count > 1 to diffentiate
+    tmp_file = tempfile.NamedTemporaryFile()  # could be used for LM
     if autolm and any([cnt > 2 for cnt in word_counts.values()]):
         yield transcription, logs.add(f"Creating unigram LM...", False)
-        lm_path = tmp_file.name
+        lm_path = tmp_file.name
         create_unigram_lm(word_counts, num_sentences, lm_path)
         yield transcription, logs.add(f"OK")
 
-
     if lm_path is None:
         yield transcription, logs.add(f"Filtering lexicon....")
         lexicon = filter_lexicon(lexicon, word_counts)
@@ -195,8 +196,8 @@
             f"Ok. Leixcon size after filtering: {len(lexicon)}"
         )
     else:
-        # kenlm throws an error if unigram LM is being used
-        # HACK: generate a bigram LM from unigram LM and a dummy bigram to trick it
+        # kenlm throws an error if unigram LM is being used
+        # HACK: generate a bigram LM from unigram LM and a dummy bigram to trick it
         maybe_generate_pseudo_bigram_arpa(lm_path)
 
     # for k, v in lexicon.items():
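The change this commit is named for: recent librosa releases (0.10 and later) make orig_sr and target_sr keyword-only in librosa.resample, so the old positional call fails at runtime. A self-contained sketch of the fixed call; the 16 kHz constant and the synthetic tone are illustrative stand-ins for the Space's ASR_SAMPLING_RATE and microphone input.

import numpy as np
import librosa

ASR_SAMPLING_RATE = 16_000  # assumed value; MMS-style ASR models expect 16 kHz audio

# Illustrative microphone input: 1 second of a 440 Hz tone captured at 48 kHz.
sr = 48_000
audio_samples = np.sin(2 * np.pi * 440 * np.arange(sr) / sr).astype(float)

if sr != ASR_SAMPLING_RATE:
    # librosa >= 0.10 rejects positional rates: a call like
    # librosa.resample(audio_samples, sr, ASR_SAMPLING_RATE) raises a TypeError.
    audio_samples = librosa.resample(
        audio_samples, orig_sr=sr, target_sr=ASR_SAMPLING_RATE
    )

print(audio_samples.shape)  # -> (16000,)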