Spaces:
Running
on
Zero
Running
on
Zero
| import spaces | |
| import gradio as gr | |
| import numpy as np | |
| import torch | |
| from peft import PeftModel, PeftConfig | |
| from transformers import WhisperForConditionalGeneration, WhisperTokenizer, WhisperProcessor, AutomaticSpeechRecognitionPipeline | |
# --- Model / pipeline setup -------------------------------------------------
# Loads the base Whisper checkpoint named in the adapter's PEFT config,
# applies the adapter, and wraps the result in an ASR pipeline.
peft_model_id = "mfidabel/Modelo_3_Whisper_Large_V3"
language = "guarani"
task = "transcribe"

peft_config = PeftConfig.from_pretrained(peft_model_id)
model = WhisperForConditionalGeneration.from_pretrained(
    peft_config.base_model_name_or_path, load_in_8bit=False, device_map="cuda:0"
)
model = PeftModel.from_pretrained(model, peft_model_id)
# merge_and_unload() returns the base model with the adapter weights merged
# in; keep that return value so the pipeline receives the plain Whisper model
# instead of the PeftModel wrapper.
model = model.merge_and_unload()

tokenizer = WhisperTokenizer.from_pretrained(peft_config.base_model_name_or_path, language=language, task=task)
processor = WhisperProcessor.from_pretrained(peft_config.base_model_name_or_path, language=language, task=task)
feature_extractor = processor.feature_extractor
# NOTE(review): the decoder prompt is built with language="english" rather
# than `language`; presumably this matches how the adapter was fine-tuned --
# confirm before changing.
forced_decoder_ids = processor.get_decoder_prompt_ids(language="english", task=task)
pipeline = AutomaticSpeechRecognitionPipeline(model=model, tokenizer=tokenizer, feature_extractor=feature_extractor)
def transcribe(audio):
    """Transcribe a recording with the module-level ASR ``pipeline``.

    Args:
        audio: ``None`` when no recording is available, otherwise a
            ``(sampling_rate, samples)`` pair where ``samples`` is a NumPy
            array (1-D mono, or 2-D with channels on the second axis --
            assumed to follow Gradio's numpy audio layout).

    Returns:
        The transcribed text, or a Spanish retry hint when there is no
        audio to transcribe (``None`` or an empty recording).
    """
    retry_hint = "Espera a que la grabación termine de subirse al servidor !! Intentelo de nuevo en unos segundos"
    if audio is None:
        return retry_hint

    sr, y = audio
    y = np.asarray(y, dtype=np.float32)
    if y.size == 0:
        # np.max() raises on an empty array; treat it like a missing upload.
        return retry_hint
    if y.ndim > 1:
        # Down-mix multi-channel audio; the pipeline expects a 1-D waveform.
        y = y.mean(axis=1)

    # Peak-normalise, but skip the division for pure silence so the waveform
    # does not turn into NaNs (0 / 0).
    peak = np.max(np.abs(y))
    if peak > 0:
        y = y / peak

    with torch.autocast("cuda"):
        result = pipeline(
            {"sampling_rate": sr, "raw": y},
            generate_kwargs={"forced_decoder_ids": forced_decoder_ids},
            max_new_tokens=255,
        )
    return result["text"]
# --- Static UI content and components ---------------------------------------
# User-facing strings are stored as proper UTF-8 (accents and emoji) so they
# render correctly in the browser.

# Sample recordings offered in the "Ejemplos" section of the UI.
examples = [
    "./examples/audio_1.mp3",
    "./examples/audio_2.mp3",
    "./examples/audio_3.mp3",
    "./examples/audio_4.mp3",
]

title = "# 🇵🇾 Reconocimiento de Voz en Guaraní"
description = """Esta es una demostración del reconocimiento de voz en Guaraní utilizando el modelo speech-to-text [Whisper](https://arxiv.org/pdf/2212.04356.pdf)
Autores:
- Mateo Andrés Fidabel Gill
- Santiago Ruben Acevedo Zarza
"""

# Audio input, pre-filled with the first example; accepts uploads or the mic.
audio_input = gr.Audio(
    value="./examples/audio_1.mp3",
    sources=["upload", "microphone"],
    label="🎤 Audio a transcribir",
    interactive=True,
)

# Read-only box that shows the transcription.
transcription = gr.Textbox(
    label="📝 Transcripción",
    interactive=False,
)
# --- Page layout and launch -------------------------------------------------
# NOTE(review): nesting below is reconstructed from the section comments;
# confirm that both Markdown components belong in the first row.
with gr.Blocks() as demo:
    with gr.Row():
        # Model Title and Description
        gr.Markdown(title)
        gr.Markdown(description)
    with gr.Row():
        # Audio Input
        audio_input.render()
    with gr.Row():
        # Text Output
        transcription.render()
    with gr.Row():
        # Submit button
        submit = gr.Button("📝 Transcribir el Audio")
    with gr.Row():
        gr.Examples(
            examples=examples,
            inputs=[audio_input],
            outputs=[transcription],
            fn=transcribe,
            label="Ejemplos",
        )
    # Event listeners must be registered inside the Blocks context.
    submit.click(
        transcribe,
        inputs=[audio_input],
        outputs=[transcription],
    )

demo.queue()
demo.launch(share=True)