autolm
Vineel Pratap committed · Commit 6f27821 · Parent(s): 78e8beb
Files changed:
- app.py +10 -5
- upload/english/gutenberg_27045.txt +3 -0
- {normalization → utils}/README.txt +0 -0
- {normalization → utils}/__init__.py +0 -0
- utils/lm.py +71 -0
- {normalization → utils}/norm_config.py +0 -0
- {normalization → utils}/punctuations.lst +0 -0
- {normalization → utils}/text_norm.py +1 -1
- zeroshot.py +31 -7
    	
app.py
CHANGED

@@ -51,18 +51,22 @@ with gr.Blocks(css="style.css") as demo:
                     interactive=False,
                     label="Language Model Score",
                 )
+            with gr.Column():
+                autolm = gr.Checkbox(
+                    label="Automatically create Unigram LM from text data", value=True
+                )
         btn = gr.Button("Submit", elem_id="submit")

         @gr.on(
-            inputs=[wscore_usedefault, lmscore_usedefault, lm_file],
+            inputs=[wscore_usedefault, lmscore_usedefault, lm_file, autolm],
             outputs=[wscore, lmscore],
         )
-        def update_slider(ws, ls, lm):
+        def update_slider(ws, ls, lm, alm):

             ws_slider = gr.Slider(
                 minimum=-10.0,
                 maximum=10.0,
-                value=LM_SCORE_DEFAULT if lm is not None else 0,
+                value=LM_SCORE_DEFAULT if (lm is not None or alm) else 0,
                 step=0.1,
                 interactive=not ws,
                 label="Word Insertion Score",
@@ -71,7 +75,7 @@ with gr.Blocks(css="style.css") as demo:
                 minimum=-10.0,
                 maximum=10.0,
                 value=WORD_SCORE_DEFAULT_IF_NOLM
-                if lm is None
+                if (lm is None and not alm)
                 else WORD_SCORE_DEFAULT_IF_LM,
                 step=0.1,
                 interactive=not ls,
@@ -97,6 +101,7 @@ with gr.Blocks(css="style.css") as demo:
             lmscore,
             wscore_usedefault,
             lmscore_usedefault,
+            autolm,
             reference,
         ],
         outputs=[text, logs],
@@ -118,7 +123,7 @@ with gr.Blocks(css="style.css") as demo:
             ],
             [
                 "upload/english/english.mp3",
-                "upload/english/
+                "upload/english/gutenberg_27045.txt",
                 " This is going to look at the code that we have in our configuration that we've already exported and compare it to our database, and we want to import",
             ],
         ],
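The UI change above is just plumbing: a new autolm checkbox feeds both the slider-default logic and the submit handler's input list. Below is a minimal, self-contained Gradio sketch of the same wiring pattern; the component layout and the LM_SCORE_DEFAULT value here are placeholders for illustration, not the app's actual code.

# Hedged sketch of the wiring pattern used above, not the app's real layout.
import gradio as gr

LM_SCORE_DEFAULT = 1.5  # placeholder value for illustration

with gr.Blocks() as demo:
    lm_file = gr.File(label="Language model (optional)")
    autolm = gr.Checkbox(label="Automatically create Unigram LM from text data", value=True)
    lmscore = gr.Slider(minimum=-10.0, maximum=10.0, step=0.1, label="Language Model Score")

    # With no explicit triggers, gr.on listens for changes to any of the inputs.
    @gr.on(inputs=[lm_file, autolm], outputs=[lmscore])
    def update_slider(lm, alm):
        # The LM score only defaults to a nonzero value if an LM is uploaded
        # or one will be auto-generated from the reference text.
        return gr.Slider(value=LM_SCORE_DEFAULT if (lm is not None or alm) else 0)

if __name__ == "__main__":
    demo.launch()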
    	
upload/english/gutenberg_27045.txt
ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a6cb4e9c754924333e37dde766098f862ddd079c81009c77454f377c96b9ac19
+size 84138
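Note that the new example text is tracked with Git LFS, so the diff shows only the LFS pointer (spec version, content hash, and size); the 84,138-byte Gutenberg corpus itself lives in LFS storage rather than in the repository history.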
    	
{normalization → utils}/README.txt
RENAMED
File without changes

{normalization → utils}/__init__.py
RENAMED
File without changes
    	
utils/lm.py
ADDED

@@ -0,0 +1,71 @@
+# Creates unigram LM following KenLM
+import math
+import shutil, tempfile
+
+def calculate_log_probabilities(word_counts, num_sentences, n_smoothing=0.01):
+    """
+    Calculate log probabilities for each word in the corpus,
+    including a special <unk> token for unknown words.
+    """
+    total_words = sum(word_counts.values())
+    total_words += 2 * num_sentences  # add counts for <s> and </s>
+    # Adjust total for <unk>
+    total_words_with_unk = total_words + 1  # Adding 1 for <unk>
+    total_words_with_unk = total_words_with_unk + total_words_with_unk * n_smoothing
+
+    # Calculate probabilities, adjust for <unk>
+    probabilities = {
+        word: ((count + n_smoothing) / total_words_with_unk)
+        for word, count in word_counts.items()
+    }
+    probabilities["<unk>"] = 1 / total_words_with_unk
+    probabilities["<s>"] = (num_sentences + n_smoothing) / total_words_with_unk
+    probabilities["</s>"] = (num_sentences + n_smoothing) / total_words_with_unk
+
+    # Convert to log probabilities
+    return {word: math.log10(prob) for word, prob in probabilities.items()}
+
+def maybe_generate_pseudo_bigram_arpa(arpa_fpath):
+    with open(arpa_fpath, "r") as file:
+        lines = file.readlines()
+
+    # if ngram order >= 2, do not modify
+    if any(["2-grams:" in l for l in lines]):
+        return
+
+    with open(arpa_fpath, "w") as file:
+        for line in lines:
+            if line.strip().startswith("ngram 1="):
+                file.write(line)
+                file.write("ngram 2=1\n")  # Add the new ngram line
+                continue
+
+            if line.strip() == "\\end\\":
+                file.write("\\2-grams:\n")
+                file.write("-9.9999999\t</s> <s>\n\n")
+
+            file.write(line)
+
+def save_log_probabilities(log_probabilities, file_path):
+    with open(file_path, "w") as file:
+        file.write(f"\data\\")
+        file.write(f"\n")
+        file.write(f"ngram 1={len(log_probabilities)}\n\n")
+        file.write(f"\\1-grams:")
+        file.write(f"\n")
+        for word, log_prob in log_probabilities.items():
+            if word == "<s>":
+                log_prob = 0
+            file.write(f"{log_prob}\t{word}\n")
+        file.write(f"\n")
+        file.write(f"\end\\")
+
+def create_unigram_lm(word_counts, num_sentences, file_path, n_smoothing=0.01):
+    log_probs = calculate_log_probabilities(word_counts, num_sentences, n_smoothing)
+    save_log_probabilities(log_probs, file_path)
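For reference, the smoothing in calculate_log_probabilities is a simple add-k scheme: with k = n_smoothing and N = the total token count (all word counts plus two boundary tokens per sentence), each word gets probability (count + k) / ((N + 1) * (1 + k)), <s> and </s> are treated as words with count num_sentences, and <unk> gets 1 / ((N + 1) * (1 + k)). The usage sketch below exercises the new helpers end to end; it is not part of the commit, and the toy counts and the "toy.arpa" filename are made up.

# Hedged sketch: build a tiny unigram ARPA file from toy counts,
# then patch it so KenLM-style decoders accept it.
from utils.lm import create_unigram_lm, maybe_generate_pseudo_bigram_arpa

word_counts = {"hello": 3, "world": 2, "again": 1}  # e.g. as produced by load_words()
num_sentences = 3

create_unigram_lm(word_counts, num_sentences, "toy.arpa")
maybe_generate_pseudo_bigram_arpa("toy.arpa")  # adds a dummy 2-gram section

with open("toy.arpa") as f:
    print(f.read())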
    	
{normalization → utils}/norm_config.py
RENAMED
File without changes

{normalization → utils}/punctuations.lst
RENAMED
File without changes
    	
{normalization → utils}/text_norm.py
RENAMED

@@ -2,7 +2,7 @@ import json
 import re
 import unicodedata

-from 
+from utils.norm_config import norm_config


 def text_normalize(text, iso_code, lower_case=True, remove_numbers=True, remove_brackets=False):
    	
zeroshot.py
CHANGED

@@ -9,7 +9,8 @@ import numpy as np
 from transformers import Wav2Vec2ForCTC, AutoProcessor
 from huggingface_hub import hf_hub_download
 from torchaudio.models.decoder import ctc_decoder
-from 
+from utils.text_norm import text_normalize
+from utils.lm import create_unigram_lm, maybe_generate_pseudo_bigram_arpa

 uroman_dir = "uroman"
 assert os.path.exists(uroman_dir)
@@ -33,8 +34,8 @@ class MY_LOG:
     def __init__(self):
         self.text = "[START]"

-    def add(self, new_log):
-        self.text = self.text + "\n" + new_log
+    def add(self, new_log, new_line=True):
+        self.text = self.text + ("\n" if new_line else " ") + new_log
         self.text = self.text.strip()
         return self.text

@@ -92,15 +93,17 @@ def filter_lexicon(lexicon, word_counts):

 def load_words(filepath):
     words = {}
+    num_sentences = 0
     with open(filepath) as f:
         for line in f:
             line = line.strip().lower()
+            num_sentences += 1
             line = text_normalize(line, iso_code="xxx")
             # ignore invalid words.
             for w in line.split():
                 words.setdefault(w, 0)
                 words[w] += 1
-    return words
+    return words, num_sentences


 def process(
@@ -111,6 +114,7 @@ def process(
     lmscore=None,
     wscore_usedefault=True,
     lmscore_usedefault=True,
+    autolm=True,
     reference=None,
 ):
     transcription, logs = "", MY_LOG()
@@ -154,13 +158,13 @@ def process(
     # Setup lexicon and decoder
     yield transcription, logs.add(f"Loading words....")
     try:
-        word_counts = load_words(words_file)
+        word_counts, num_sentences = load_words(words_file)
     except Exception as e:
         yield f"ERROR: Loading words failed '{str(e)}'", logs.text
         return

     yield transcription, logs.add(
-        f"Loaded {len(word_counts)} words.\nPreparing lexicon...."
+        f"Loaded {len(word_counts)} words from {num_sentences} lines.\nPreparing lexicon...."
     )

     try:
@@ -168,15 +172,35 @@ def process(
     except Exception as e:
         yield f"ERROR: Creating lexicon failed '{str(e)}'", logs.text
         return
-
+    # for k, v in lexicon.items():
+    #     if len(v) < 5:
+    #         print(k, v)
     yield transcription, logs.add(f"Leixcon size: {len(lexicon)}")

+    # Input could be sentences OR list of words. Check if atleast one word has a count > 1 to diffentiate
+    tmp_file = tempfile.NamedTemporaryFile()  # could be used for LM
+    if autolm and any([cnt > 2 for cnt in word_counts.values()]):
+        yield transcription, logs.add(f"Creating unigram LM...", False)
+        lm_path = tmp_file.name
+        create_unigram_lm(word_counts, num_sentences, lm_path)
+        yield transcription, logs.add(f"OK")
+
+
     if lm_path is None:
         yield transcription, logs.add(f"Filtering lexicon....")
         lexicon = filter_lexicon(lexicon, word_counts)
         yield transcription, logs.add(
             f"Ok. Leixcon size after filtering: {len(lexicon)}"
         )
+    else:
+        # kenlm throws an error if unigram LM is being used
+        # HACK: generate a bigram LM from unigram LM and a dummy bigram to trick it
+        maybe_generate_pseudo_bigram_arpa(lm_path)
+
+    # for k, v in lexicon.items():
+    #     if len(v) < 5:
+    #         print(k, v)
+
     # print(lexicon["the"], lexicon["\"(t)he"])
     with tempfile.NamedTemporaryFile() as lexicon_file:
         if lm_path is not None and not lm_path.strip():
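Taken together, the zeroshot.py changes mean that when autolm is enabled and the uploaded text looks like running sentences (some word occurs more than twice) rather than a bare word list, process() writes a unigram ARPA LM into a temporary file, points lm_path at it, and later patches in a dummy bigram section because the KenLM-based decoder rejects pure unigram models. The sketch below is a hedged, standalone condensation of that branch; the helper name maybe_build_lm is invented for illustration, and the app itself does this inline inside process().

# Illustrative only: a condensed version of the autolm branch added above.
import tempfile

from utils.lm import create_unigram_lm, maybe_generate_pseudo_bigram_arpa


def maybe_build_lm(word_counts, num_sentences, autolm=True):
    """Return a path to an auto-built unigram ARPA LM, or None to skip LM decoding."""
    # A plain word list has (almost) all counts equal to 1; running text repeats words.
    if not (autolm and any(cnt > 2 for cnt in word_counts.values())):
        return None
    # delete=False keeps the file on disk for the decoder; the app instead keeps
    # the NamedTemporaryFile object alive for the lifetime of process().
    tmp = tempfile.NamedTemporaryFile(suffix=".arpa", delete=False)
    create_unigram_lm(word_counts, num_sentences, tmp.name)
    # KenLM-style decoders reject pure unigram LMs, so add a dummy 2-gram section.
    maybe_generate_pseudo_bigram_arpa(tmp.name)
    return tmp.name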
