import gradio as gr
import librosa
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Load the fine-tuned Arabic Wav2Vec2 checkpoint once at startup;
# eval() disables dropout for inference.
processor = Wav2Vec2Processor.from_pretrained("maher13/arabic-iti")
model = Wav2Vec2ForCTC.from_pretrained("maher13/arabic-iti").eval()


def transcribe(audio_file):
    """Transcribe a single audio file with the Wav2Vec2 CTC model."""
    if not audio_file:
        return "N/A"
    # Resample to the 16 kHz rate the model expects.
    wav, _ = librosa.load(audio_file.name, sr=16000)
    input_values = processor(
        wav, sampling_rate=16000, return_tensors="pt", padding=True
    ).input_values
    # Disable gradient tracking around the forward pass (the expensive
    # step), not just around the argmax.
    with torch.no_grad():
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    predicted_ids[predicted_ids == -100] = processor.tokenizer.pad_token_id
    return processor.tokenizer.batch_decode(predicted_ids)[0]


def asr_transcript(audio_file, audio_file2):
    # Microphone and uploaded audio go through the same pipeline.
    return transcribe(audio_file), transcribe(audio_file2)


# Pre-3.0 Gradio API: type="file" hands the function a tempfile object
# whose .name attribute holds the path on disk.
gradio_ui = gr.Interface(
    fn=asr_transcript,
    title="Speech to Text Graduation project\nsponsored by TensorGraph",
    inputs=[
        gr.inputs.Audio(source="microphone", type="file", optional=True),
        gr.inputs.Audio(source="upload", type="file", optional=True),
    ],
    outputs=[
        gr.outputs.Textbox(label="Microphone transcript"),
        gr.outputs.Textbox(label="Uploaded-file transcript"),
    ],
)

gradio_ui.launch(share=True)
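# Optional sanity check, kept commented out because launch() blocks: a
# sketch of calling the pipeline directly, without the UI. "sample.wav"
# is a hypothetical local 16 kHz mono recording, not part of the original
# script; open() returns a file object whose .name attribute matches what
# the Gradio file inputs provide.
#
#     with open("sample.wav", "rb") as f:
#         mic_text, upload_text = asr_transcript(f, None)
#     print(mic_text)      # decoded transcript of sample.wav
#     print(upload_text)   # "N/A", since no second file was given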