"""Gradio service that translates uploaded audio to English text.

Uses faster-whisper (large-v1, CPU/float32) with word timestamps and emits
an Azure-Speech-style JSON payload for each transcription request.
"""
import json
import os

import gradio
import torch
from faster_whisper import WhisperModel

# Audio container formats the service accepts.
ALLOWED_EXTENSIONS = {'wav', 'mp3', 'flac', 'ogg'}

# Load the model once at startup; CPU inference in full precision.
model_size = "large-v1"
model = WhisperModel(model_size, device="cpu", compute_type="float32")


def allowed_file(filename):
    """Return True if *filename* has an extension listed in ALLOWED_EXTENSIONS."""
    return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS


def segment_to_json(segment):
    """Convert an iterable of faster-whisper segments into a list of dicts.

    The shape mimics the Azure Speech REST response: one entry per segment
    with Offset/Duration in seconds and per-word timing/confidence in NBest.
    """
    segments = []
    for s in segment:
        segments.append({
            "Offset": s.start,
            "Duration": round(s.end - s.start, 3),
            "Channel": 0,
            "DisplayText": s.text.strip(),
            "NBest": [
                {
                    "Lexical": s.text.strip(),
                    "Display": s.text.strip(),
                    "Words": [{
                        "Word": w.word.strip(),
                        "Offset": w.start,
                        "Duration": round(w.end - w.start, 3),
                        "Confidence": w.probability
                    } for w in s.words]
                }
            ]
        })
    return segments


def process_audio(filename):
    """Transcribe/translate *filename* to English and return the result as a JSON string.

    The uploaded temp file is deleted after processing. Files with a
    disallowed extension are rejected with an error payload instead of
    being passed to the model.
    """
    print(filename)
    # Guard with the extension whitelist (was defined but never used before).
    if not allowed_file(filename):
        return json.dumps({"error": "unsupported file type"}, ensure_ascii=False)
    with torch.no_grad():
        segments, info = model.transcribe(
            filename, beam_size=5, word_timestamps=True,
            task="translate", language="en")
        segments = list(segments)
        print(info)
        # Return the processed data as a JSON object
        result = segment_to_json(segments)
    # BUG FIX: the original ran os.system(f'rm "(unknown)"'), shell-deleting a
    # literal "(unknown)" path instead of the upload. Remove the real file,
    # without a shell (no injection risk); ignore a missing/undeletable file.
    try:
        os.remove(filename)
    except OSError:
        pass
    print(json.dumps(result, ensure_ascii=False))
    return json.dumps(result, ensure_ascii=False)


filename = gradio.Audio(type="filepath", label="Audio")
interface = gradio.Interface(process_audio, inputs=[filename], outputs="text")
interface.launch(show_api=True)