import gradio as gr
import librosa
import torch
import numpy as np

# Lazy init placeholders: the heavy model/decoder objects are created on the
# first request rather than at import time, so the UI comes up quickly.
processor = None
model = None
decoder = None


def init_models():
    global processor, model, decoder
    if processor is not None:
        return
    # Imported lazily alongside the models so startup stays cheap.
    from transformers import AutoProcessor, AutoModelForCTC
    from pyctcdecode import build_ctcdecoder

    processor = AutoProcessor.from_pretrained("ai4bharat/indicwav2vec-hindi")
    model = AutoModelForCTC.from_pretrained("ai4bharat/indicwav2vec-hindi")

    # pyctcdecode needs the CTC labels in vocabulary-index order.
    vocab = processor.tokenizer.get_vocab()
    sorted_vocab = sorted(vocab.items(), key=lambda kv: kv[1])
    tokens = [tok for tok, _ in sorted_vocab]
    # wav2vec2 tokenizers mark word boundaries with a delimiter token
    # (usually "|"), while pyctcdecode expects a literal space.
    word_delim = getattr(processor.tokenizer, "word_delimiter_token", "|")
    tokens = [" " if tok == word_delim else tok for tok in tokens]

    arpa_path = "./hindi_small_4gram_pruned_clean.arpa"
    binary_path = "./hindi_small_4gram_pruned.binary"

    # The binary KenLM file does not expose its vocabulary, so the unigram
    # list is recovered from the \1-grams: section of the matching ARPA file.
    # ARPA 1-gram lines look like "<logprob> <word> [<backoff>]"; pyctcdecode
    # only needs the words themselves.
    unigrams = set()
    with open(arpa_path, encoding="utf-8") as f:
        in_unigrams = False
        for line in f:
            line = line.strip()
            if not in_unigrams:
                if line == "\\1-grams:":
                    in_unigrams = True
                continue
            if line.startswith("\\"):  # next section header ends the 1-grams
                break
            parts = line.split(maxsplit=2)
            if len(parts) >= 2:
                unigrams.add(parts[1])

    decoder = build_ctcdecoder(tokens, kenlm_model_path=binary_path, unigrams=unigrams)


def transcribe(audio_path):
    if not audio_path:
        return "No audio provided."
    init_models()
    # librosa loads the file and resamples it to the 16 kHz rate the
    # model was trained on.
    audio_np, sr = librosa.load(audio_path, sr=16000)
    audio_np = audio_np.astype(np.float32)
    inputs = processor(audio_np, sampling_rate=sr, return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits
    # Normalize the raw logits to log-probabilities before beam search so
    # the acoustic scores combine correctly with the KenLM scores.
    log_probs = torch.log_softmax(logits, dim=-1).cpu().numpy()[0]
    return decoder.decode(log_probs)


iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(type="filepath", label="Upload a WAV file"),
    outputs="text",
    title="Indic ASR Demo (Hindi)",
    description="Upload a Hindi audio file (.wav) to see the transcription.",
)

if __name__ == "__main__":
    iface.launch(server_name="0.0.0.0", server_port=7860, ssr_mode=False)
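
# --- Usage sketch (not part of the app) --------------------------------------
# A minimal way to exercise the endpoint once the server is running, using
# gradio_client. "sample_hi.wav" is a hypothetical local file; "/predict" is
# the default api_name that gr.Interface assigns to its function.
#
#   from gradio_client import Client, handle_file
#   client = Client("http://localhost:7860/")
#   print(client.predict(handle_file("sample_hi.wav"), api_name="/predict"))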