peter2000's picture
Update app.py
b10a5df
raw
history blame
2.09 kB
import gradio as gr
import librosa
import soundfile as sf
import torch
import warnings
import os
from transformers import Wav2Vec2ProcessorWithLM, Wav2Vec2CTCTokenizer, Wav2Vec2Model
warnings.filterwarnings("ignore")
# Load the pretrained wav2vec2-based ASR model (via SpeechBrain)
#from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
#from fastapi import FastAPI, HTTPException, File
#from transformers import pipeline
from speechbrain.pretrained import EncoderDecoderASR
# Identifier of the pretrained model; also used as the prefix of the example
# clip path below.
_MODEL_REPO = "speechbrain/asr-wav2vec2-commonvoice-rw"

# Build the ASR model once at import time so every request reuses it.
# `savedir` is the local directory handed to from_hparams (presumably where the
# fetched model files are stored -- confirm against the SpeechBrain docs).
asr_model = EncoderDecoderASR.from_hparams(
    source=_MODEL_REPO,
    savedir="pretrained_models/asr-wav2vec2-commonvoice-rw",
)

# Transcribe the example clip once at startup; the returned text is discarded.
# NOTE(review): presumably a warm-up / smoke test -- confirm it is still wanted.
asr_model.transcribe_file(f"{_MODEL_REPO}/example.mp3")
# define speech-to-text function
def asr_transcript(audio, audio_microphone, model_params):
    """Transcribe an uploaded or microphone-recorded audio clip to text.

    Args:
        audio: The uploaded audio, or None when nothing was uploaded. Either
            an object exposing the file path as ``.name`` or a path string.
        audio_microphone: The microphone recording, in the same form as
            ``audio``. When truthy it is used in preference to ``audio``.
        model_params: Value of the model selector. Currently unused; kept so
            the signature matches the interface's three inputs.

    Returns:
        A single string: the transcription on success, otherwise a message
        explaining why nothing was transcribed.
    """
    if audio is None and audio_microphone is None:
        # Return exactly one string: the interface declares a single output
        # component, so a tuple of two messages would not map onto it.
        return (
            "Please provide audio by uploading a file or by recording audio "
            "using microphone by pressing Record (And allow usage of microphone)"
        )

    # A microphone recording wins over an uploaded file when both are given.
    selected = audio_microphone if audio_microphone else audio
    if not selected:
        return "File not valid"

    # Accept both file-like objects (path in ``.name``) and plain path strings.
    audio_path = getattr(selected, "name", selected)
    return asr_model.transcribe_file(audio_path)
# Input widgets, listed in the positional order asr_transcript receives them:
# uploaded file, microphone recording, model selector.
_upload_input = gr.inputs.Audio(
    label="Upload Audio File",
    type="file",
    optional=True,
)
_microphone_input = gr.inputs.Audio(
    source="microphone",
    type="file",
    optional=True,
    label="Record from microphone",
)
# NOTE(review): the choices and the article text below mention "deepspeech",
# while the model loaded in this file comes from SpeechBrain -- confirm the
# wording is intended.
_model_selector = gr.inputs.Dropdown(
    choices=["deepspeech", "coqui (soon)"],
    type="value",
    default="deepspeech",
    label="Select speech recognition model ",
    optional=False,
)
_transcript_output = gr.outputs.Textbox(label="Recognized speech")

# Assemble the web UI around the transcription function.
gradio_ui = gr.Interface(
    fn=asr_transcript,
    inputs=[_upload_input, _microphone_input, _model_selector],
    outputs=[_transcript_output],
    title="Kinyarwanda Speech Recognition",
    description="Upload an audio clip or record from browser using microphone, and let AI do the hard work of transcribing.",
    article="""
This demo showcases the pretrained model from deepspeech.
""",
)

# Start the app; share=True is passed through to launch() unchanged.
gradio_ui.launch(share=True)