Spaces:

mfidabel
/

guarani-speech-recognition

Running on Zero

App Files Files Community

guarani-speech-recognition / app.py

mfidabel

Update app.py

e566d20 verified over 1 year ago

raw

history blame

3.12 kB

	import spaces
	import gradio as gr
	import numpy as np
	import torch
	from peft import PeftModel, PeftConfig
	from transformers import WhisperForConditionalGeneration, WhisperTokenizer, WhisperProcessor, AutomaticSpeechRecognitionPipeline

	# Hub id of the PEFT adapter; the base checkpoint name is read from the
	# adapter's own config below.
	peft_model_id = "mfidabel/Modelo_3_Whisper_Large_V3"
	language = "guarani"
	task = "transcribe"

	peft_config = PeftConfig.from_pretrained(peft_model_id)

	# Load the base Whisper checkpoint named by the adapter config onto the GPU.
	model = WhisperForConditionalGeneration.from_pretrained(
	    peft_config.base_model_name_or_path,
	    load_in_8bit=False,
	    device_map="cuda:0",
	)

	# Attach the adapter, then fold its weights into the base model.
	# merge_and_unload() returns the merged base model; keep that result so the
	# pipeline below receives a plain Whisper model rather than the PEFT wrapper.
	model = PeftModel.from_pretrained(model, peft_model_id)
	model = model.merge_and_unload()

	tokenizer = WhisperTokenizer.from_pretrained(
	    peft_config.base_model_name_or_path, language=language, task=task
	)
	processor = WhisperProcessor.from_pretrained(
	    peft_config.base_model_name_or_path, language=language, task=task
	)
	feature_extractor = processor.feature_extractor

	# NOTE(review): the decoder prompt is built with language="english" even
	# though `language` above is "guarani" -- presumably this matches the language
	# token the adapter was trained with; confirm before changing it.
	forced_decoder_ids = processor.get_decoder_prompt_ids(language="english", task=task)

	# Speech-recognition pipeline shared by every transcribe() call.
	pipeline = AutomaticSpeechRecognitionPipeline(
	    model=model,
	    tokenizer=tokenizer,
	    feature_extractor=feature_extractor,
	)

	@spaces.GPU
	def transcribe(audio):
	    """Transcribe one recording with the module-level ASR pipeline.

	    Args:
	        audio: ``None`` when no recording is available, otherwise a
	            ``(sampling_rate, samples)`` pair where ``samples`` is a NumPy
	            array. Multi-channel input is assumed to be laid out as
	            ``(n_samples, n_channels)``, which is how Gradio delivers it.

	    Returns:
	        The transcribed text, or a user-facing "try again" message when
	        there is no audio to process.
	    """
	    wait_message = "Espera a que la grabación termine de subirse al servidor !! Intentelo de nuevo en unos segundos"
	    if audio is None:
	        return wait_message

	    sr, y = audio
	    y = y.astype(np.float32)

	    # The ASR pipeline expects a single channel; average the channels of
	    # stereo recordings instead of passing a 2-D array through.
	    if y.ndim > 1:
	        y = y.mean(axis=1)

	    # A zero-length recording has no peak to normalise by (np.max would
	    # raise), so treat it like a recording that has not arrived yet.
	    if y.size == 0:
	        return wait_message

	    # Peak-normalise, but leave pure silence untouched: dividing by a zero
	    # peak would fill the waveform with NaN.
	    peak = np.max(np.abs(y))
	    if peak > 0:
	        y /= peak

	    with torch.autocast("cuda"):
	        return pipeline({"sampling_rate": sr, "raw": y}, generate_kwargs={"forced_decoder_ids": forced_decoder_ids}, max_new_tokens=255)["text"]

	# Paths of the audio clips offered as clickable examples in the UI.
	examples = [
	    "./examples/audio_1.mp3",
	    "./examples/audio_2.mp3",
	    "./examples/audio_3.mp3",
	    "./examples/audio_4.mp3",
	]

	# Markdown heading shown at the top of the page.
	title = "# 🇵🇾 Reconocimiento de Voz en Guaraní"

	# Markdown description shown next to the heading.
	description = """Esta es una demostración del reconocimiento de voz en Guaraní utilizando el modelo speech-to-text [Whisper](https://arxiv.org/pdf/2212.04356.pdf)

	Autores:
	- Mateo Andrés Fidabel Gill
	- Santiago Ruben Acevedo Zarza
	"""

	# Audio input, pre-loaded with the first example clip; accepts file uploads
	# and microphone recordings.
	audio_input = gr.Audio(
	    value="./examples/audio_1.mp3",
	    sources=["upload", "microphone"],
	    label="🎤 Audio a transcribir",
	    interactive=True,
	)

	# Read-only text box that displays the transcription.
	transcription = gr.Textbox(
	    label="📝 Transcripción",
	    interactive=False,
	)

	# Page layout: one row each for the header, the audio input, the text
	# output, the submit button and the example clips.
	with gr.Blocks() as demo:

	    # Title and description
	    with gr.Row():
	        gr.Markdown(title)
	        gr.Markdown(description)

	    # Audio input (component created earlier, placed here)
	    with gr.Row():
	        audio_input.render()

	    # Transcription output (component created earlier, placed here)
	    with gr.Row():
	        transcription.render()

	    # Submit button
	    with gr.Row():
	        submit = gr.Button("📝 Transcribir el Audio")

	    # Example clips wired to the same input, output and function
	    with gr.Row():
	        gr.Examples(
	            examples=examples,
	            inputs=[audio_input],
	            outputs=[transcription],
	            fn=transcribe,
	            label="Ejemplos",
	        )

	    # Run the transcription when the button is clicked.
	    submit.click(
	        transcribe,
	        inputs=[audio_input],
	        outputs=[transcription],
	    )


	# Enable request queuing, then start the app.
	demo.queue()
	demo.launch(share=True)