ales committed · Commit adca0d8 · 1 Parent(s): 0952218

upd to latest gradio version; format

Files changed (4)
  1. .gitignore +4 -0
  2. app.py +36 -38
  3. pipeline.py +13 -20
  4. requirements.txt +1 -0
.gitignore ADDED
@@ -0,0 +1,4 @@
+.venv
+.env
+__pycache__
+.DS_Store
app.py CHANGED
@@ -1,16 +1,13 @@
 from pprint import pformat
 
-from huggingface_hub import hf_hub_download
-
-import librosa
-
 import gradio as gr
+import librosa
+from huggingface_hub import hf_hub_download
 
 from pipeline import PreTrainedPipeline
 
-
-HF_HUB_URL = 'ales/wav2vec2-cv-be'
-LM_HUB_FP = 'language_model/cv8be_5gram.bin'
+HF_HUB_URL = "ales/wav2vec2-cv-be"
+LM_HUB_FP = "language_model/cv8be_5gram.bin"
 MODEL_SAMPLING_RATE = 16_000  # 16kHz
 
 # download Language Model from HF Hub
@@ -20,18 +17,18 @@ lm_fp = hf_hub_download(repo_id=HF_HUB_URL, filename=LM_HUB_FP)
 pipeline = PreTrainedPipeline(model_path=HF_HUB_URL, language_model_fp=lm_fp)
 
 
-def main(recorded_audio_fp: str, uploaded_audio_fp: str):
+def main(recorded_audio_fp: str | None, uploaded_audio_fp: str | None):
     audio_fp = None
     if recorded_audio_fp is not None:
         audio_fp = recorded_audio_fp
-        used_audiofile = 'recorded'
+        used_audiofile = "recorded"
     elif uploaded_audio_fp is not None:
         audio_fp = uploaded_audio_fp
-        used_audiofile = 'uploaded'
+        used_audiofile = "uploaded"
     else:
         return (
-            'Памылка! Вы мусіце альбо запісаць, альбо запампаваць аўдыяфайл.',
-            'Error! You have to either record or upload an audiofile.'
+            "Памылка! Вы мусіце альбо запісаць, альбо запампаваць аўдыяфайл.",
+            "Error! You have to either record or upload an audiofile.",
         )
 
     # read audio file
@@ -39,19 +36,19 @@ def main(recorded_audio_fp: str, uploaded_audio_fp: str):
 
     # recognize speech
     pipeline_res = pipeline(inputs=inputs)
-    text = pipeline_res['text'][0]  # unpack batch of size 1
+    text = pipeline_res["text"][0]  # unpack batch of size 1
 
     # add technical information to the output
    tech_data = pipeline_res
-    del tech_data['text']
-    tech_data['used_audiofile'] = used_audiofile
-    tech_data['recorded_file_present'] = recorded_audio_fp is not None
-    tech_data['uploaded_file_present'] = uploaded_audio_fp is not None
-    tech_data['audiofile_path'] = audio_fp
-    tech_data['model_sampling_rate'] = MODEL_SAMPLING_RATE
-    tech_data['inputs_shape'] = inputs.shape
-    tech_data['inputs_max'] = inputs.max().item()
-    tech_data['inputs_min'] = inputs.min().item()
+    del tech_data["text"]
+    tech_data["used_audiofile"] = used_audiofile
+    tech_data["recorded_file_present"] = recorded_audio_fp is not None
+    tech_data["uploaded_file_present"] = uploaded_audio_fp is not None
+    tech_data["audiofile_path"] = audio_fp
+    tech_data["model_sampling_rate"] = MODEL_SAMPLING_RATE
+    tech_data["inputs_shape"] = inputs.shape
+    tech_data["inputs_max"] = inputs.max().item()
+    tech_data["inputs_min"] = inputs.min().item()
 
     tech_data_str = pformat(tech_data)
 
@@ -67,26 +64,27 @@ The model used can be found here: [ales/wav2vec2-cv-be](https://huggingface.co/a
 iface = gr.Interface(
     fn=main,
     inputs=[
-        gr.inputs.Audio(
-            source='microphone', type='filepath',
-            label='Запішыце аўдыяфайл, каб распазнаць маўленьне',
-            optional=True,
+        gr.Audio(
+            sources=["microphone"],
+            type="filepath",
+            label="Запішыце аўдыяфайл, каб распазнаць маўленьне",
         ),
-        gr.inputs.Audio(
-            source='upload', type='filepath',
-            label='Альбо загрузіце ўжо запісаны аўдыяфайл сюды',
-            optional=True
+        gr.Audio(
+            sources=["upload"],
+            type="filepath",
+            label="Альбо загрузіце ўжо запісаны аўдыяфайл сюды",
         ),
     ],
     outputs=[
-        gr.outputs.Textbox(type='str', label='Распазнаны тэкст'),
-        gr.outputs.Textbox(type='str', label='Тэхнічная інфармацыя')
+        gr.Textbox(label="Распазнаны тэкст"),
+        gr.Textbox(label="Тэхнічная інфармацыя"),
     ],
-    title='wav2vec2 fine-tuned on CommonVoice 8 Be + Language Model',
-    description=('Мадэль распазнаваньня беларускага маўленьня, навучаная на датсэце Common Voice 8.\n'
-                 'Акустычная мадэль + моўная мадэль.'
-                 ),
-    article=article
+    title="wav2vec2 fine-tuned on CommonVoice 8 Be + Language Model",
+    description=(
+        "Мадэль распазнаваньня беларускага маўленьня, навучаная на датсэце Common Voice 8.\n"
+        "Акустычная мадэль + моўная мадэль."
+    ),
+    article=article,
 )
 
-iface.launch(enable_queue=True)
+iface.launch()
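Beyond the quoting cleanup, this rewrite moves app.py off the removed gr.inputs.Audio / gr.outputs.Textbox namespaces onto the top-level gr.Audio / gr.Textbox components (the optional= keyword no longer exists, and the singular source= argument became the list-valued sources=), and it drops enable_queue=True, which newer Gradio releases no longer accept in launch(). A minimal sketch of how explicit request queueing can still be enabled under current Gradio, with a purely illustrative echo handler standing in for the Space's main():

import gradio as gr

def echo(text: str) -> str:
    # placeholder handler; the real Space wires up main() defined above
    return text

demo = gr.Interface(fn=echo, inputs=gr.Textbox(), outputs=gr.Textbox())
demo.queue()   # replaces the removed launch(enable_queue=True)
demo.launch()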
pipeline.py CHANGED
@@ -1,23 +1,17 @@
-import numpy as np
-
 from typing import Dict
 
-import torch
+import numpy as np
 import pyctcdecode
-
-from transformers import (
-    Wav2Vec2Processor,
-    Wav2Vec2ProcessorWithLM,
-    Wav2Vec2ForCTC,
-)
+import torch
+from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, Wav2Vec2ProcessorWithLM
 
 
-class PreTrainedPipeline():
+class PreTrainedPipeline:
 
     def __init__(self, model_path: str, language_model_fp: str):
         self.language_model_fp = language_model_fp
 
-        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
         self.model = Wav2Vec2ForCTC.from_pretrained(model_path)
         self.model.to(self.device)
 
@@ -25,7 +19,9 @@ class PreTrainedPipeline():
         self.sampling_rate = processor.feature_extractor.sampling_rate
 
         vocab = processor.tokenizer.get_vocab()
-        sorted_vocab_dict = [(char, ix) for char, ix in sorted(vocab.items(), key=lambda item: item[1])]
+        sorted_vocab_dict = [
+            (char, ix) for char, ix in sorted(vocab.items(), key=lambda item: item[1])
+        ]
 
         self.decoder = pyctcdecode.build_ctcdecoder(
             labels=[x[0] for x in sorted_vocab_dict],
@@ -35,7 +31,7 @@ class PreTrainedPipeline():
         self.processor_with_lm = Wav2Vec2ProcessorWithLM(
             feature_extractor=processor.feature_extractor,
             tokenizer=processor.tokenizer,
-            decoder=self.decoder
+            decoder=self.decoder,
         )
 
     def __call__(self, inputs: np.array) -> Dict[str, str]:
@@ -49,9 +45,8 @@ class PreTrainedPipeline():
         """
 
         input_values = self.processor_with_lm(
-            inputs, return_tensors="pt",
-            sampling_rate=self.sampling_rate
-        )['input_values']
+            inputs, return_tensors="pt", sampling_rate=self.sampling_rate
+        )["input_values"]
 
         input_values = input_values.to(self.device)
 
@@ -60,8 +55,6 @@ class PreTrainedPipeline():
         model_outs = self.model(input_values)
         logits = model_outs.logits.cpu().detach().numpy()
 
-        text_predicted = self.processor_with_lm.batch_decode(logits)['text']
+        text_predicted = self.processor_with_lm.batch_decode(logits)["text"]
 
-        return {
-            "text": text_predicted
-        }
+        return {"text": text_predicted}
 
 
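PreTrainedPipeline loads the wav2vec2 acoustic model, builds a pyctcdecode decoder from the tokenizer vocabulary plus the KenLM binary, and decodes CTC logits through Wav2Vec2ProcessorWithLM.batch_decode. A hedged usage sketch mirroring how app.py drives it, assuming a local file audio.wav (the path is illustrative) resampled to the model's 16 kHz rate:

import librosa
from huggingface_hub import hf_hub_download

from pipeline import PreTrainedPipeline

# fetch the 5-gram language model from the model repo, as app.py does
lm_fp = hf_hub_download(repo_id="ales/wav2vec2-cv-be", filename="language_model/cv8be_5gram.bin")
pipeline = PreTrainedPipeline(model_path="ales/wav2vec2-cv-be", language_model_fp=lm_fp)

# load audio as a mono float waveform at the 16 kHz rate the model expects
waveform, _ = librosa.load("audio.wav", sr=16_000, mono=True)

result = pipeline(inputs=waveform)
print(result["text"][0])  # unpack the batch of size 1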
requirements.txt CHANGED
@@ -3,4 +3,5 @@ pyctcdecode==0.3.0
 torch
 torchaudio
 librosa
+gradio
 https://github.com/kpu/kenlm/archive/master.zip