import gradio as gr
import ffmpeg
import torch
from sys import platform
from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
from transformers.utils import is_flash_attn_2_available

# Load the hosted openai/whisper-small demo straight from the Hugging Face Hub
# and launch it. The local-pipeline version of the app below is kept commented out.
gr.load("models/openai/whisper-small").launch()

# pipe = None  # lazily initialized ASR pipeline (see transcribe_test)

# def extract_audio(video_path):
#     """Extract the audio track from a video into a WAV file."""
#     output_audio_path = 'audio_extraido.wav'
#     ffmpeg.input(video_path).output(output_audio_path).run()

# def create_pipe(model='openai/whisper-small'):
#     """Build a local Whisper ASR pipeline on CUDA, Apple MPS, or CPU."""
#     if torch.cuda.is_available():
#         device = "cuda:0"
#     elif platform == "darwin":
#         device = "mps"
#     else:
#         device = "cpu"
#     torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
#     model_id = model
#     model = AutoModelForSpeechSeq2Seq.from_pretrained(
#         model_id,
#         torch_dtype=torch_dtype,
#         low_cpu_mem_usage=True,
#         use_safetensors=True,
#         attn_implementation="flash_attention_2" if is_flash_attn_2_available() else "sdpa",
#     )
#     model.to(device)
#     processor = AutoProcessor.from_pretrained(model_id)
#     pipe = pipeline(
#         "automatic-speech-recognition",
#         model=model,
#         tokenizer=processor.tokenizer,
#         feature_extractor=processor.feature_extractor,
#         torch_dtype=torch_dtype,
#         device=device,
#     )
#     return pipe

# def transcribe_test(file, progress=gr.Progress()):
#     global pipe
#     if pipe is None:  # build the pipeline lazily on first use
#         pipe = create_pipe()
#     progress(0, desc="Trabalhando..")
#     generate_kwargs = {}
#     generate_kwargs["language"] = "Portuguese"
#     generate_kwargs["task"] = "transcribe"
#     outputs = pipe(
#         file,
#         chunk_length_s=30,
#         batch_size=24,
#         generate_kwargs=generate_kwargs,
#         # return_timestamps=True,
#     )
#     # outputs['chunks'] only exists when return_timestamps=True; return plain text otherwise
#     return outputs['text']

# with gr.Blocks(title="Para a Livia de Passos :)") as demo:
#     description = "Vamos tentar transcrever o texto com a voz da samara..."
#     transcribe = gr.Interface(
#         fn=transcribe_test,
#         description=description,
#         inputs=[
#             gr.File(label="Coloque o arquivo aquii", file_types=['.mp4', '.mp3', '.wav'])
#         ],
#         outputs=[gr.Text(label="Transcription")],
#     )

# if __name__ == "__main__":
#     demo.launch()
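
# --- Hypothetical usage sketch (not part of the original app) ---
# The commented-out code above defines extract_audio() and create_pipe() but never
# wires them together. The sketch below is one way they could be combined to
# transcribe a video locally, assuming the block above is uncommented.
# transcribe_video() and the example filename are illustrative, not existing code.
#
# def transcribe_video(video_path, language="Portuguese"):
#     """Extract the audio track from a video and run it through the local Whisper pipeline."""
#     audio_path = 'audio_extraido.wav'
#     ffmpeg.input(video_path).output(audio_path).run(overwrite_output=True)
#     asr = create_pipe()
#     outputs = asr(
#         audio_path,
#         chunk_length_s=30,
#         batch_size=24,
#         generate_kwargs={"language": language, "task": "transcribe"},
#     )
#     return outputs["text"]
#
# Example: transcribe_video("exemplo.mp4") would return the Portuguese transcript as a string.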