import functools
from pprint import pformat
from typing import Tuple

import numpy as np
import torch
import torchaudio
from torchaudio.transforms import Resample
from huggingface_hub import hf_hub_download
import gradio as gr

from pipeline import PreTrainedPipeline


HF_HUB_URL = 'ales/wav2vec2-cv-be'
LM_HUB_FP = 'language_model/cv8be_5gram.bin'
TARGET_SAMPLING_RATE = 16_000  # audio is resampled to 16kHz before recognition


@functools.lru_cache(maxsize=1)
def _get_pipeline() -> PreTrainedPipeline:
    """Build the recognition pipeline once and reuse it for later requests.

    The language model file is fetched from the HF Hub and the pipeline is
    constructed on the first call only; subsequent calls return the cached
    instance instead of re-initializing it for every request.
    """
    # download Language Model from HF Hub
    lm_fp = hf_hub_download(repo_id=HF_HUB_URL, filename=LM_HUB_FP)

    # init pipeline
    return PreTrainedPipeline(model_path=HF_HUB_URL, language_model_fp=lm_fp)


def main(audio_fp: str) -> Tuple[str, str]:
    """Recognize speech in an audio file.

    The audio is loaded with torchaudio, averaged down to a single channel
    if it has more than one, resampled to ``TARGET_SAMPLING_RATE`` and passed
    to the pipeline as a flat numpy array.

    Args:
        audio_fp: path to the audio file to transcribe.

    Returns:
        A pair ``(text, tech_data_str)``: the first item of the pipeline's
        ``'text'`` output, and a pretty-printed string with the remaining
        pipeline output plus details about the audio preprocessing.
    """
    audio, sampling_rate = torchaudio.load(audio_fp, normalize=True)
    init_audio_shape = audio.shape

    # convert stereo to mono
    converted_to_mono = audio.shape[0] > 1
    if converted_to_mono:
        audio = torch.mean(audio, dim=0, keepdim=True)

    # resample audio to 16kHz
    resampler = Resample(orig_freq=sampling_rate, new_freq=TARGET_SAMPLING_RATE)
    audio_resampled = resampler(audio)

    inputs = audio_resampled.numpy().flatten()  # cast to numpy as expected by the pipeline

    # recognize speech (the pipeline is created lazily and shared between calls)
    pipeline_res = _get_pipeline()(inputs=inputs)
    text = pipeline_res['text'][0]  # unpack batch of size 1

    # add technical information to the output;
    # everything the pipeline returned except the text itself is kept
    tech_data = pipeline_res
    del tech_data['text']

    tech_data['sampling_rate_orig'] = sampling_rate
    tech_data['init_audio_shape'] = init_audio_shape
    tech_data['converted_to_mono'] = converted_to_mono
    tech_data['resampled_audio_shape'] = audio_resampled.shape
    tech_data['inputs_shape'] = inputs.shape
    tech_data['inputs_max'] = np.max(inputs).item()
    tech_data['inputs_min'] = np.min(inputs).item()

    tech_data_str = pformat(tech_data)

    return text, tech_data_str


# Web UI: records audio from the microphone, hands its file path to `main`
# and shows the recognized text and the technical information in two textboxes.
iface = gr.Interface(
    fn=main,
    inputs=gr.inputs.Audio(
        source='microphone', type='filepath',
        label='Запішыце аўдыяфайл, каб распазнаць маўленьне'
    ),
    outputs=[
        gr.outputs.Textbox(type='str', label='Распазнаны тэкст'),
        gr.outputs.Textbox(type='str', label='Тэхнічная інфармацыя')
    ],
    title='wav2vec2 fine-tuned on CommonVoice 8 Be + Language Model',
    description=('Мадэль распазнаваньня беларускага маўленьня, навучаная на датсэце Common Voice 8.\n'
                 'Акустычная мадэль + моўная мадэль.'
    ),
)

iface.launch()