autolm
Vineel Pratap committed · Commit 6f27821 · Parent(s): 78e8beb
Files changed:
- app.py +10 -5
- upload/english/gutenberg_27045.txt +3 -0
- {normalization → utils}/README.txt +0 -0
- {normalization → utils}/__init__.py +0 -0
- utils/lm.py +71 -0
- {normalization → utils}/norm_config.py +0 -0
- {normalization → utils}/punctuations.lst +0 -0
- {normalization → utils}/text_norm.py +1 -1
- zeroshot.py +31 -7
    	
app.py
CHANGED

@@ -51,18 +51,22 @@ with gr.Blocks(css="style.css") as demo:
                     interactive=False,
                     label="Language Model Score",
                 )
+            with gr.Column():
+                autolm = gr.Checkbox(
+                    label="Automatically create Unigram LM from text data", value=True
+                )
         btn = gr.Button("Submit", elem_id="submit")

         @gr.on(
-            inputs=[wscore_usedefault, lmscore_usedefault, lm_file],
+            inputs=[wscore_usedefault, lmscore_usedefault, lm_file, autolm],
             outputs=[wscore, lmscore],
         )
-        def update_slider(ws, ls, lm):
+        def update_slider(ws, ls, lm, alm):

             ws_slider = gr.Slider(
                 minimum=-10.0,
                 maximum=10.0,
-                value=LM_SCORE_DEFAULT if lm is not None else 0,
+                value=LM_SCORE_DEFAULT if (lm is not None or alm) else 0,
                 step=0.1,
                 interactive=not ws,
                 label="Word Insertion Score",
@@ -71,7 +75,7 @@ with gr.Blocks(css="style.css") as demo:
                 minimum=-10.0,
                 maximum=10.0,
                 value=WORD_SCORE_DEFAULT_IF_NOLM
-                if lm is None
+                if (lm is None and not alm)
                 else WORD_SCORE_DEFAULT_IF_LM,
                 step=0.1,
                 interactive=not ls,
@@ -97,6 +101,7 @@ with gr.Blocks(css="style.css") as demo:
             lmscore,
             wscore_usedefault,
             lmscore_usedefault,
+            autolm,
             reference,
         ],
         outputs=[text, logs],
@@ -118,7 +123,7 @@ with gr.Blocks(css="style.css") as demo:
             ],
             [
                 "upload/english/english.mp3",
-                "upload/english/
+                "upload/english/gutenberg_27045.txt",
                 " This is going to look at the code that we have in our configuration that we've already exported and compare it to our database, and we want to import",
             ],
         ],
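The UI change above is just plumbing: a new autolm checkbox feeds both the slider-default logic and the submit handler's input list. Below is a minimal, self-contained Gradio sketch of the same wiring pattern; the component layout and the LM_SCORE_DEFAULT value here are placeholders for illustration, not the app's actual code.

# Hedged sketch of the wiring pattern used above, not the app's real layout.
import gradio as gr

LM_SCORE_DEFAULT = 1.5  # placeholder value for illustration

with gr.Blocks() as demo:
    lm_file = gr.File(label="Language model (optional)")
    autolm = gr.Checkbox(label="Automatically create Unigram LM from text data", value=True)
    lmscore = gr.Slider(minimum=-10.0, maximum=10.0, step=0.1, label="Language Model Score")

    # With no explicit triggers, gr.on listens for changes to any of the inputs.
    @gr.on(inputs=[lm_file, autolm], outputs=[lmscore])
    def update_slider(lm, alm):
        # The LM score only defaults to a nonzero value if an LM is uploaded
        # or one will be auto-generated from the reference text.
        return gr.Slider(value=LM_SCORE_DEFAULT if (lm is not None or alm) else 0)

if __name__ == "__main__":
    demo.launch()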
    	
upload/english/gutenberg_27045.txt
ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a6cb4e9c754924333e37dde766098f862ddd079c81009c77454f377c96b9ac19
+size 84138
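Note that the new example text is tracked with Git LFS, so the diff shows only the LFS pointer (spec version, content hash, and size); the 84,138-byte Gutenberg corpus itself lives in LFS storage rather than in the repository history.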
    	
{normalization → utils}/README.txt
RENAMED
File without changes

{normalization → utils}/__init__.py
RENAMED
File without changes
    	
utils/lm.py
ADDED

@@ -0,0 +1,71 @@
+# Creates unigram LM following KenLM
+import math
+import shutil, tempfile
+
+def calculate_log_probabilities(word_counts, num_sentences, n_smoothing=0.01):
+    """
+    Calculate log probabilities for each word in the corpus,
+    including a special <unk> token for unknown words.
+    """
+    total_words = sum(word_counts.values())
+    total_words += 2 * num_sentences  # add counts for <s> and </s>
+    # Adjust total for <unk>
+    total_words_with_unk = total_words + 1  # Adding 1 for <unk>
+    total_words_with_unk = total_words_with_unk + total_words_with_unk * n_smoothing
+
+    # Calculate probabilities, adjust for <unk>
+    probabilities = {
+        word: ((count + n_smoothing) / total_words_with_unk)
+        for word, count in word_counts.items()
+    }
+    probabilities["<unk>"] = 1 / total_words_with_unk
+    probabilities["<s>"] = (num_sentences + n_smoothing) / total_words_with_unk
+    probabilities["</s>"] = (num_sentences + n_smoothing) / total_words_with_unk
+
+    # Convert to log probabilities
+    return {word: math.log10(prob) for word, prob in probabilities.items()}
+
+def maybe_generate_pseudo_bigram_arpa(arpa_fpath):
+    with open(arpa_fpath, "r") as file:
+        lines = file.readlines()
+
+    # if ngram order >= 2, do not modify
+    if any(["2-grams:" in l for l in lines]):
+        return
+
+    with open(arpa_fpath, "w") as file:
+        for line in lines:
+            if line.strip().startswith("ngram 1="):
+                file.write(line)
+                file.write("ngram 2=1\n")  # Add the new ngram line
+                continue
+
+            if line.strip() == "\\end\\":
+                file.write("\\2-grams:\n")
+                file.write("-9.9999999\t</s> <s>\n\n")
+
+            file.write(line)
+
+def save_log_probabilities(log_probabilities, file_path):
+    with open(file_path, "w") as file:
+        file.write(f"\data\\")
+        file.write(f"\n")
+        file.write(f"ngram 1={len(log_probabilities)}\n\n")
+        file.write(f"\\1-grams:")
+        file.write(f"\n")
+        for word, log_prob in log_probabilities.items():
+            if word == "<s>":
+                log_prob = 0
+            file.write(f"{log_prob}\t{word}\n")
+        file.write(f"\n")
+        file.write(f"\end\\")
+
+def create_unigram_lm(word_counts, num_sentences, file_path, n_smoothing=0.01):
+    log_probs = calculate_log_probabilities(word_counts, num_sentences, n_smoothing)
+    save_log_probabilities(log_probs, file_path)
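For reference, the smoothing in calculate_log_probabilities is a simple add-k scheme: with k = n_smoothing and N = the total token count (all word counts plus two boundary tokens per sentence), each word gets probability (count + k) / ((N + 1) * (1 + k)), <s> and </s> are treated as words with count num_sentences, and <unk> gets 1 / ((N + 1) * (1 + k)). The usage sketch below exercises the new helpers end to end; it is not part of the commit, and the toy counts and the "toy.arpa" filename are made up.

# Hedged sketch: build a tiny unigram ARPA file from toy counts,
# then patch it so KenLM-style decoders accept it.
from utils.lm import create_unigram_lm, maybe_generate_pseudo_bigram_arpa

word_counts = {"hello": 3, "world": 2, "again": 1}  # e.g. as produced by load_words()
num_sentences = 3

create_unigram_lm(word_counts, num_sentences, "toy.arpa")
maybe_generate_pseudo_bigram_arpa("toy.arpa")  # adds a dummy 2-gram section

with open("toy.arpa") as f:
    print(f.read())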
    	
{normalization → utils}/norm_config.py
RENAMED
File without changes

{normalization → utils}/punctuations.lst
RENAMED
File without changes
    	
{normalization → utils}/text_norm.py
RENAMED

@@ -2,7 +2,7 @@ import json
 import re
 import unicodedata

-from 
+from utils.norm_config import norm_config


 def text_normalize(text, iso_code, lower_case=True, remove_numbers=True, remove_brackets=False):
    	
zeroshot.py
CHANGED

@@ -9,7 +9,8 @@ import numpy as np
 from transformers import Wav2Vec2ForCTC, AutoProcessor
 from huggingface_hub import hf_hub_download
 from torchaudio.models.decoder import ctc_decoder
-from 
+from utils.text_norm import text_normalize
+from utils.lm import create_unigram_lm, maybe_generate_pseudo_bigram_arpa

 uroman_dir = "uroman"
 assert os.path.exists(uroman_dir)
@@ -33,8 +34,8 @@ class MY_LOG:
     def __init__(self):
         self.text = "[START]"

-    def add(self, new_log):
-        self.text = self.text + "\n" + new_log
+    def add(self, new_log, new_line=True):
+        self.text = self.text + ("\n" if new_line else " ") + new_log
         self.text = self.text.strip()
         return self.text

@@ -92,15 +93,17 @@ def filter_lexicon(lexicon, word_counts):

 def load_words(filepath):
     words = {}
+    num_sentences = 0
     with open(filepath) as f:
         for line in f:
             line = line.strip().lower()
+            num_sentences += 1
             line = text_normalize(line, iso_code="xxx")
             # ignore invalid words.
             for w in line.split():
                 words.setdefault(w, 0)
                 words[w] += 1
-    return words
+    return words, num_sentences


 def process(
@@ -111,6 +114,7 @@ def process(
     lmscore=None,
     wscore_usedefault=True,
     lmscore_usedefault=True,
+    autolm=True,
     reference=None,
 ):
     transcription, logs = "", MY_LOG()
@@ -154,13 +158,13 @@ def process(
     # Setup lexicon and decoder
     yield transcription, logs.add(f"Loading words....")
     try:
-        word_counts = load_words(words_file)
+        word_counts, num_sentences = load_words(words_file)
     except Exception as e:
         yield f"ERROR: Loading words failed '{str(e)}'", logs.text
         return

     yield transcription, logs.add(
-        f"Loaded {len(word_counts)} words.\nPreparing lexicon...."
+        f"Loaded {len(word_counts)} words from {num_sentences} lines.\nPreparing lexicon...."
     )

     try:
@@ -168,15 +172,35 @@ def process(
     except Exception as e:
         yield f"ERROR: Creating lexicon failed '{str(e)}'", logs.text
         return
-
+    # for k, v in lexicon.items():
+    #     if len(v) < 5:
+    #         print(k, v)
     yield transcription, logs.add(f"Leixcon size: {len(lexicon)}")

+    # Input could be sentences OR list of words. Check if atleast one word has a count > 1 to diffentiate
+    tmp_file = tempfile.NamedTemporaryFile()  # could be used for LM
+    if autolm and any([cnt > 2 for cnt in word_counts.values()]):
+        yield transcription, logs.add(f"Creating unigram LM...", False)
+        lm_path = tmp_file.name
+        create_unigram_lm(word_counts, num_sentences, lm_path)
+        yield transcription, logs.add(f"OK")
+
+
     if lm_path is None:
         yield transcription, logs.add(f"Filtering lexicon....")
         lexicon = filter_lexicon(lexicon, word_counts)
         yield transcription, logs.add(
             f"Ok. Leixcon size after filtering: {len(lexicon)}"
         )
+    else:
+        # kenlm throws an error if unigram LM is being used
+        # HACK: generate a bigram LM from unigram LM and a dummy bigram to trick it
+        maybe_generate_pseudo_bigram_arpa(lm_path)
+
+    # for k, v in lexicon.items():
+    #     if len(v) < 5:
+    #         print(k, v)
+
     # print(lexicon["the"], lexicon["\"(t)he"])
     with tempfile.NamedTemporaryFile() as lexicon_file:
         if lm_path is not None and not lm_path.strip():
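Taken together, the zeroshot.py changes mean that when autolm is enabled and the uploaded text looks like running sentences (some word occurs more than twice) rather than a bare word list, process() writes a unigram ARPA LM into a temporary file, points lm_path at it, and later patches in a dummy bigram section because the KenLM-based decoder rejects pure unigram models. The sketch below is a hedged, standalone condensation of that branch; the helper name maybe_build_lm is invented for illustration, and the app itself does this inline inside process().

# Illustrative only: a condensed version of the autolm branch added above.
import tempfile

from utils.lm import create_unigram_lm, maybe_generate_pseudo_bigram_arpa


def maybe_build_lm(word_counts, num_sentences, autolm=True):
    """Return a path to an auto-built unigram ARPA LM, or None to skip LM decoding."""
    # A plain word list has (almost) all counts equal to 1; running text repeats words.
    if not (autolm and any(cnt > 2 for cnt in word_counts.values())):
        return None
    # delete=False keeps the file on disk for the decoder; the app instead keeps
    # the NamedTemporaryFile object alive for the lifetime of process().
    tmp = tempfile.NamedTemporaryFile(suffix=".arpa", delete=False)
    create_unigram_lm(word_counts, num_sentences, tmp.name)
    # KenLM-style decoders reject pure unigram LMs, so add a dummy 2-gram section.
    maybe_generate_pseudo_bigram_arpa(tmp.name)
    return tmp.name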
