upd to latest gradio version; format
- .gitignore +4 -0
- app.py +36 -38
- pipeline.py +13 -20
- requirements.txt +1 -0
.gitignore ADDED
@@ -0,0 +1,4 @@
+.venv
+.env
+__pycache__
+.DS_Store
app.py CHANGED
@@ -1,16 +1,13 @@
 from pprint import pformat
 
-from huggingface_hub import hf_hub_download
-
-import librosa
-
 import gradio as gr
+import librosa
+from huggingface_hub import hf_hub_download
 
 from pipeline import PreTrainedPipeline
 
-LM_HUB_FP = 'language_model/cv8be_5gram.bin'
+HF_HUB_URL = "ales/wav2vec2-cv-be"
+LM_HUB_FP = "language_model/cv8be_5gram.bin"
 MODEL_SAMPLING_RATE = 16_000  # 16kHz
 
 # download Language Model from HF Hub
@@ -20,18 +17,18 @@ lm_fp = hf_hub_download(repo_id=HF_HUB_URL, filename=LM_HUB_FP)
 pipeline = PreTrainedPipeline(model_path=HF_HUB_URL, language_model_fp=lm_fp)
 
 
-def main(recorded_audio_fp: str, uploaded_audio_fp: str):
+def main(recorded_audio_fp: str | None, uploaded_audio_fp: str | None):
     audio_fp = None
     if recorded_audio_fp is not None:
         audio_fp = recorded_audio_fp
+        used_audiofile = "recorded"
     elif uploaded_audio_fp is not None:
         audio_fp = uploaded_audio_fp
+        used_audiofile = "uploaded"
     else:
         return (
+            "Памылка! Вы мусіце альбо запісаць, альбо запампаваць аўдыяфайл.",
+            "Error! You have to either record or upload an audiofile.",
         )
 
     # read audio file
@@ -39,19 +36,19 @@ def main(recorded_audio_fp: str, uploaded_audio_fp: str):
 
     # recognize speech
     pipeline_res = pipeline(inputs=inputs)
+    text = pipeline_res["text"][0]  # unpack batch of size 1
 
     # add technical information to the output
    tech_data = pipeline_res
+    del tech_data["text"]
+    tech_data["used_audiofile"] = used_audiofile
+    tech_data["recorded_file_present"] = recorded_audio_fp is not None
+    tech_data["uploaded_file_present"] = uploaded_audio_fp is not None
+    tech_data["audiofile_path"] = audio_fp
+    tech_data["model_sampling_rate"] = MODEL_SAMPLING_RATE
+    tech_data["inputs_shape"] = inputs.shape
+    tech_data["inputs_max"] = inputs.max().item()
+    tech_data["inputs_min"] = inputs.min().item()
 
     tech_data_str = pformat(tech_data)
 
@@ -67,26 +64,27 @@ The model used can be found here: [ales/wav2vec2-cv-be](https://huggingface.co/a
 iface = gr.Interface(
     fn=main,
     inputs=[
+        gr.Audio(
+            sources=["microphone"],
+            type="filepath",
+            label="Запішыце аўдыяфайл, каб распазнаць маўленьне",
         ),
+        gr.Audio(
+            sources=["upload"],
+            type="filepath",
+            label="Альбо загрузіце ўжо запісаны аўдыяфайл сюды",
         ),
     ],
     outputs=[
+        gr.Textbox(label="Распазнаны тэкст"),
+        gr.Textbox(label="Тэхнічная інфармацыя"),
     ],
+    title="wav2vec2 fine-tuned on CommonVoice 8 Be + Language Model",
+    description=(
+        "Мадэль распазнаваньня беларускага маўленьня, навучаная на датсэце Common Voice 8.\n"
+        "Акустычная мадэль + моўная мадэль."
+    ),
+    article=article,
 )
 
+iface.launch()
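The old gr.* calls are truncated in the diff above, but the replacement follows the current Gradio component API: gr.Audio takes a sources list plus type="filepath", and plain gr.Textbox components are used for the outputs. A minimal self-contained sketch of that pattern (the transcribe stub and the English labels are illustrative, not part of the Space):

import gradio as gr


def transcribe(recorded_fp: str | None, uploaded_fp: str | None) -> tuple[str, str]:
    # Stand-in for the Space's ASR pipeline: just report which file would be transcribed.
    audio_fp = recorded_fp or uploaded_fp
    if audio_fp is None:
        return "Error! You have to either record or upload an audiofile.", ""
    return f"(transcription of {audio_fp} goes here)", f"audiofile_path: {audio_fp}"


demo = gr.Interface(
    fn=transcribe,
    inputs=[
        # recent Gradio releases take a `sources` list; older ones used a singular `source=` argument
        gr.Audio(sources=["microphone"], type="filepath", label="Record audio"),
        gr.Audio(sources=["upload"], type="filepath", label="Or upload an audio file"),
    ],
    outputs=[
        gr.Textbox(label="Recognized text"),
        gr.Textbox(label="Technical information"),
    ],
)

demo.launch()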
pipeline.py CHANGED
@@ -1,23 +1,17 @@
-import numpy as np
-
 from typing import Dict
 
+import numpy as np
 import pyctcdecode
-
-from transformers import (
-    Wav2Vec2Processor,
-    Wav2Vec2ProcessorWithLM,
-    Wav2Vec2ForCTC,
-)
+import torch
+from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, Wav2Vec2ProcessorWithLM
 
 
-class PreTrainedPipeline():
+class PreTrainedPipeline:
 
     def __init__(self, model_path: str, language_model_fp: str):
         self.language_model_fp = language_model_fp
 
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
         self.model = Wav2Vec2ForCTC.from_pretrained(model_path)
         self.model.to(self.device)
 
@@ -25,7 +19,9 @@ class PreTrainedPipeline():
         self.sampling_rate = processor.feature_extractor.sampling_rate
 
         vocab = processor.tokenizer.get_vocab()
+        sorted_vocab_dict = [
+            (char, ix) for char, ix in sorted(vocab.items(), key=lambda item: item[1])
+        ]
 
         self.decoder = pyctcdecode.build_ctcdecoder(
             labels=[x[0] for x in sorted_vocab_dict],
@@ -35,7 +31,7 @@ class PreTrainedPipeline():
         self.processor_with_lm = Wav2Vec2ProcessorWithLM(
             feature_extractor=processor.feature_extractor,
             tokenizer=processor.tokenizer,
-            decoder=self.decoder
+            decoder=self.decoder,
         )
 
     def __call__(self, inputs: np.array) -> Dict[str, str]:
@@ -49,9 +45,8 @@ class PreTrainedPipeline():
         """
 
         input_values = self.processor_with_lm(
-            inputs, return_tensors="pt",
-        )['input_values']
+            inputs, return_tensors="pt", sampling_rate=self.sampling_rate
+        )["input_values"]
 
         input_values = input_values.to(self.device)
 
@@ -60,8 +55,6 @@ class PreTrainedPipeline():
         model_outs = self.model(input_values)
         logits = model_outs.logits.cpu().detach().numpy()
 
+        text_predicted = self.processor_with_lm.batch_decode(logits)["text"]
 
-        return {
-            "text": text_predicted
-        }
+        return {"text": text_predicted}
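
For reference, a short sketch of driving this pipeline the same way app.py does. The repo id, LM path, and 16 kHz rate come from the diff; "example.wav" and the librosa.load call are assumptions, since the audio-reading line sits outside the changed hunks:

import librosa
from huggingface_hub import hf_hub_download

from pipeline import PreTrainedPipeline

HF_HUB_URL = "ales/wav2vec2-cv-be"
LM_HUB_FP = "language_model/cv8be_5gram.bin"

# fetch the 5-gram KenLM binary and build the CTC + LM pipeline
lm_fp = hf_hub_download(repo_id=HF_HUB_URL, filename=LM_HUB_FP)
asr = PreTrainedPipeline(model_path=HF_HUB_URL, language_model_fp=lm_fp)

# load and resample audio to the model's 16 kHz rate (file name is a placeholder)
inputs, _ = librosa.load("example.wav", sr=16_000, mono=True)

result = asr(inputs=inputs)
print(result["text"][0])  # batch_decode returns a batch; app.py unpacks element 0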
requirements.txt CHANGED
@@ -3,4 +3,5 @@ pyctcdecode==0.3.0
 torch
 torchaudio
 librosa
+gradio
 https://github.com/kpu/kenlm/archive/master.zip