Vineel Pratap committed · Commit f138a14 · Parent(s): a7567f9

resampling fix

Files changed:
- app.py +2 -1
- requirements.txt +2 -2
- utils/lm.py +9 -11
- utils/norm_config.py +8 -9
- utils/text_norm.py +15 -6
- zeroshot.py +9 -8
app.py CHANGED

@@ -53,7 +53,8 @@ with gr.Blocks(css="style.css") as demo:
             )
         with gr.Column():
             autolm = gr.Checkbox(
-                label="Automatically create Unigram LM from text data",
+                label="Automatically create Unigram LM from text data",
+                value=True,
             )
     btn = gr.Button("Submit", elem_id="submit")
 
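For context, a minimal sketch of what the added value=True does in Gradio: gr.Checkbox defaults to False, so the box previously started unchecked and now starts checked, opting into the auto-LM path unless the user unticks it. The echo textbox and handler below are illustrative, not the Space's actual layout.

import gradio as gr

with gr.Blocks() as demo:
    # value=True pre-checks the box; before this commit it started unchecked
    # (Gradio's default value is False).
    autolm = gr.Checkbox(
        label="Automatically create Unigram LM from text data",
        value=True,
    )
    echo = gr.Textbox(label="Checkbox state")       # illustrative output
    autolm.change(lambda v: str(v), autolm, echo)   # illustrative handler

if __name__ == "__main__":
    demo.launch()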
requirements.txt CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:41b8b278a5c4d2fc182c7893bcc683ad261ab0612cea1da58aaed1b358fd9649
+size 164
utils/lm.py CHANGED

@@ -1,14 +1,15 @@
-# Creates unigram LM following KenLM
-import math
+# Creates unigram LM following KenLM
+import math
 import shutil, tempfile
 
+
 def calculate_log_probabilities(word_counts, num_sentences, n_smoothing=0.01):
     """
     Calculate log probabilities for each word in the corpus,
     including a special <unk> token for unknown words.
     """
-    total_words = sum(word_counts.values())
-    total_words += 2 * num_sentences
+    total_words = sum(word_counts.values())
+    total_words += 2 * num_sentences  # add counts for <s> and </s>
     # Adjust total for <unk>
     total_words_with_unk = total_words + 1  # Adding 1 for <unk>
     total_words_with_unk = total_words_with_unk + total_words_with_unk * n_smoothing
@@ -25,6 +26,7 @@ def calculate_log_probabilities(word_counts, num_sentences, n_smoothing=0.01):
     # Convert to log probabilities
     return {word: math.log10(prob) for word, prob in probabilities.items()}
 
+
 def maybe_generate_pseudo_bigram_arpa(arpa_fpath):
     with open(arpa_fpath, "r") as file:
         lines = file.readlines()
@@ -46,6 +48,7 @@ def maybe_generate_pseudo_bigram_arpa(arpa_fpath):
 
             file.write(line)
 
+
 def save_log_probabilities(log_probabilities, file_path):
     with open(file_path, "w") as file:
         file.write(f"\data\\")
@@ -59,13 +62,8 @@ def save_log_probabilities(log_probabilities, file_path):
         file.write(f"{log_prob}\t{word}\n")
     file.write(f"\n")
     file.write(f"\end\\")
-
+
+
 def create_unigram_lm(word_counts, num_sentences, file_path, n_smoothing=0.01):
     log_probs = calculate_log_probabilities(word_counts, num_sentences, n_smoothing)
     save_log_probabilities(log_probs, file_path)
-
-
-
-
-
-
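To make the counting arithmetic in calculate_log_probabilities concrete, here is a small worked sketch on a toy corpus. Only the totals mirror the diff; the final per-word formula (add-k style (count + n_smoothing) / total, and the <unk> entry) is an assumption, since the lines that build the probabilities dict are not part of this diff.

from collections import Counter
import math

# Toy corpus: two "sentences", so num_sentences = 2.
sentences = ["the cat sat", "the dog sat"]
word_counts = Counter(w for s in sentences for w in s.split())
num_sentences = len(sentences)
n_smoothing = 0.01

# Totals as in the diff: count <s> and </s> once per sentence, reserve one
# slot for <unk>, then inflate the denominator by the smoothing mass.
total_words = sum(word_counts.values()) + 2 * num_sentences   # 6 + 4 = 10
total_words_with_unk = total_words + 1                        # 11
total_words_with_unk += total_words_with_unk * n_smoothing    # 11.11

# Assumed add-k completion (the probabilities dict itself is not shown in the diff):
log_probs = {
    w: math.log10((c + n_smoothing) / total_words_with_unk)
    for w, c in word_counts.items()
}
log_probs["<unk>"] = math.log10(n_smoothing / total_words_with_unk)  # assumed
print(f"{log_probs['the']:.2f}")  # "the" has count 2 -> log10(2.01 / 11.11) ≈ -0.74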
utils/norm_config.py CHANGED

@@ -42,7 +42,7 @@ inverted_question_mark = r"\u00BF"
 
 
 # Hindi
-hindi_danda =
+hindi_danda = "\u0964"
 
 # Egyptian Arabic
 # arabic_percent = r"\u066A"
@@ -175,7 +175,7 @@ nominal_digit_shapes = r"\u206f"
 with open(f"{os.path.dirname(__file__)}/punctuations.lst", "r") as punc_f:
     punc_list = punc_f.readlines()
 
-punct_pattern = r""
+punct_pattern = r""
 for punc in punc_list:
     # the first character in the tab separated line is the punc to be removed
     punct_pattern += re.escape(punc.split("\t")[0])
@@ -213,7 +213,6 @@ shared_punc_list = (
     + arabic_question_mark
     + chinese_punc
     + punct_pattern
-
 )
 
 shared_mappping = {
@@ -242,11 +241,11 @@ norm_config = {
         "mapping": shared_mappping,
         "digit_set": shared_digits,
         "unicode_norm": "NFKC",
-        "rm_diacritics"
+        "rm_diacritics": False,
     }
 }
 
-
+# =============== Mongolian ===============#
 
 norm_config["mon"] = norm_config["*"].copy()
 # add soft hyphen to punc list to match with fleurs
@@ -254,23 +253,23 @@ norm_config["mon"]["del_set"] += r"\u00AD"
 
 norm_config["khk"] = norm_config["mon"].copy()
 
-
+# =============== Hebrew ===============#
 
 norm_config["heb"] = norm_config["*"].copy()
 # add "HEBREW POINT" symbols to match with fleurs
 norm_config["heb"]["del_set"] += r"\u05B0-\u05BF\u05C0-\u05CF"
 
-
+# =============== Thai ===============#
 
 norm_config["tha"] = norm_config["*"].copy()
 # add "Zero width joiner" symbols to match with fleurs
 norm_config["tha"]["punc_set"] += r"\u200D"
 
-
+# =============== Arabic ===============#
 norm_config["ara"] = norm_config["*"].copy()
 norm_config["ara"]["mapping"]["ٱ"] = "ا"
 norm_config["arb"] = norm_config["ara"].copy()
 
-
+# =============== Javanese ===============#
 norm_config["jav"] = norm_config["*"].copy()
 norm_config["jav"]["rm_diacritics"] = True
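For orientation, a minimal sketch of the pattern the new section headers annotate: norm_config["*"] holds the shared defaults and each ISO language code copies it and overrides individual fields. The toy punc_set/del_set values below are illustrative stand-ins, not the real character classes.

# Illustrative defaults; the real "*" entry carries much larger character sets.
norm_config = {
    "*": {
        "lower_case": True,
        "punc_set": r"\.\,",      # illustrative subset
        "del_set": r"",           # illustrative
        "unicode_norm": "NFKC",
        "rm_diacritics": False,   # the key this commit adds to the defaults
    }
}

# =============== Hebrew ===============#
norm_config["heb"] = norm_config["*"].copy()
norm_config["heb"]["del_set"] += r"\u05B0-\u05BF\u05C0-\u05CF"  # HEBREW POINT range

# =============== Javanese ===============#
norm_config["jav"] = norm_config["*"].copy()
norm_config["jav"]["rm_diacritics"] = True

# Overriding a copied string or flag does not touch the shared defaults.
print(norm_config["jav"]["rm_diacritics"], norm_config["*"]["rm_diacritics"])  # True False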
utils/text_norm.py CHANGED

@@ -5,7 +5,9 @@ import unicodedata
 from utils.norm_config import norm_config
 
 
-def text_normalize(
+def text_normalize(
+    text, iso_code, lower_case=True, remove_numbers=True, remove_brackets=False
+):
 
     """Given a text, normalize it by changing to lower case, removing punctuations, removing words that only contain digits and removing extra spaces
 
@@ -15,17 +17,23 @@ def text_normalize(text, iso_code, lower_case=True, remove_numbers=True, remove_
         remove_numbers : Boolean flag to specify if words containing only digits should be removed
 
     Returns:
-        normalized_text : the string after all normalization
+        normalized_text : the string after all normalization
 
     """
 
     config = norm_config.get(iso_code, norm_config["*"])
 
-    for field in [
+    for field in [
+        "lower_case",
+        "punc_set",
+        "del_set",
+        "mapping",
+        "digit_set",
+        "unicode_norm",
+    ]:
         if field not in config:
             config[field] = norm_config["*"][field]
 
-
     text = unicodedata.normalize(config["unicode_norm"], text)
 
     # Convert to lower case
@@ -34,7 +42,7 @@ def text_normalize(text, iso_code, lower_case=True, remove_numbers=True, remove_
         text = text.lower()
 
     # brackets
-
+
     # always text inside brackets with numbers in them. Usually corresponds to "(Sam 23:17)"
     text = re.sub(r"\([^\)]*\d[^\)]*\)", " ", text)
     if remove_brackets:
@@ -84,9 +92,10 @@ def text_normalize(text, iso_code, lower_case=True, remove_numbers=True, remove_
 
     if config["rm_diacritics"]:
         from unidecode import unidecode
+
         normalized_text = unidecode(normalized_text)
 
     # Remove extra spaces
     normalized_text = re.sub(r"\s+", " ", normalized_text).strip()
 
-    return normalized_text
+    return normalized_text
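A usage sketch of the reformatted signature, assuming the module is importable as utils.text_norm (as in the diff). The sample sentence and the "eng" ISO code are illustrative; unknown codes fall back to the "*" config via norm_config.get, and the exact output depends on that language's config.

from utils.text_norm import text_normalize

# Lower-cases, strips punctuation, drops digit-only words, removes bracketed
# spans containing digits such as "(Sam 23:17)", and squeezes whitespace.
raw = "Hello, World!  (Sam 23:17) 1234 tokens"
print(text_normalize(raw, iso_code="eng", remove_numbers=True))
# expected shape: "hello world tokens"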
zeroshot.py CHANGED

@@ -34,7 +34,7 @@ class MY_LOG:
     def __init__(self):
         self.text = "[START]"
 
-    def add(self, new_log, new_line=
+    def add(self, new_log, new_line=True):
         self.text = self.text + ("\n" if new_line else " ") + new_log
         self.text = self.text.strip()
         return self.text
@@ -127,7 +127,9 @@ def process(
         audio_samples = (audio_samples / 32768.0).astype(float)
 
         if sr != ASR_SAMPLING_RATE:
-            audio_samples = librosa.resample(
+            audio_samples = librosa.resample(
+                audio_samples, orig_sr=sr, target_sr=ASR_SAMPLING_RATE
+            )
     else:
         # file upload
         assert isinstance(audio_data, str)
@@ -179,15 +181,14 @@ def process(
         # print(k, v)
     yield transcription, logs.add(f"Leixcon size: {len(lexicon)}")
 
-    # Input could be sentences OR list of words. Check if atleast one word has a count > 1 to diffentiate
-    tmp_file = tempfile.NamedTemporaryFile()
+    # Input could be sentences OR list of words. Check if atleast one word has a count > 1 to diffentiate
+    tmp_file = tempfile.NamedTemporaryFile()  # could be used for LM
     if autolm and any([cnt > 2 for cnt in word_counts.values()]):
         yield transcription, logs.add(f"Creating unigram LM...", False)
-        lm_path = tmp_file.name
+        lm_path = tmp_file.name
         create_unigram_lm(word_counts, num_sentences, lm_path)
         yield transcription, logs.add(f"OK")
 
-
     if lm_path is None:
         yield transcription, logs.add(f"Filtering lexicon....")
         lexicon = filter_lexicon(lexicon, word_counts)
@@ -195,8 +196,8 @@
             f"Ok. Leixcon size after filtering: {len(lexicon)}"
         )
     else:
-        # kenlm throws an error if unigram LM is being used
-        # HACK: generate a bigram LM from unigram LM and a dummy bigram to trick it
+        # kenlm throws an error if unigram LM is being used
+        # HACK: generate a bigram LM from unigram LM and a dummy bigram to trick it
         maybe_generate_pseudo_bigram_arpa(lm_path)
 
     # for k, v in lexicon.items():
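The change this commit is named for: recent librosa releases (0.10 and later) make orig_sr and target_sr keyword-only in librosa.resample, so the old positional call fails at runtime. A self-contained sketch of the fixed call; the 16 kHz constant and the synthetic tone are illustrative stand-ins for the Space's ASR_SAMPLING_RATE and microphone input.

import numpy as np
import librosa

ASR_SAMPLING_RATE = 16_000  # assumed value; MMS-style ASR models expect 16 kHz audio

# Illustrative microphone input: 1 second of a 440 Hz tone captured at 48 kHz.
sr = 48_000
audio_samples = np.sin(2 * np.pi * 440 * np.arange(sr) / sr).astype(float)

if sr != ASR_SAMPLING_RATE:
    # librosa >= 0.10 rejects positional rates: a call like
    # librosa.resample(audio_samples, sr, ASR_SAMPLING_RATE) raises a TypeError.
    audio_samples = librosa.resample(
        audio_samples, orig_sr=sr, target_sr=ASR_SAMPLING_RATE
    )

print(audio_samples.shape)  # -> (16000,)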