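"""Gradio demo for Hindi speech recognition.

Transcribes uploaded audio with ai4bharat/indicwav2vec-hindi (a wav2vec2 CTC
model) and rescores the output with a KenLM 4-gram language model through
pyctcdecode beam search. Models are loaded lazily on the first request.
"""
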
import gradio as gr
import librosa
import torch
import numpy as np

# Lazy-init globals; populated by init_models() on the first request.
processor = None
model = None
decoder = None


def init_models():
    global processor, model, decoder
    if processor is not None:
        return
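
    # Import the heavy dependencies lazily so module import (and app
    # startup) stays fast; they are only needed once models are loaded.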
    from transformers import AutoProcessor, AutoModelForCTC
    from pyctcdecode import build_ctcdecoder

    processor = AutoProcessor.from_pretrained("ai4bharat/indicwav2vec-hindi")
    model = AutoModelForCTC.from_pretrained("ai4bharat/indicwav2vec-hindi")
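
    # pyctcdecode expects the label list in vocabulary-index order, so sort
    # the tokenizer vocab by token id before extracting the token strings.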
    vocab = processor.tokenizer.get_vocab()
    sorted_vocab = sorted(vocab.items(), key=lambda kv: kv[1])
    tokens = [tok for tok, _ in sorted_vocab]
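
    # Paths to the KenLM 4-gram language model: the .arpa text dump is
    # scanned for its unigram vocabulary, the .binary file scores the beams.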
    arpa_path = "./hindi_small_4gram_pruned_clean.arpa"
    binary_path = "./hindi_small_4gram_pruned.binary"
    unigrams = set()
    with open(arpa_path, encoding="utf-8") as f:
        in_unigrams = False
        for line in f:
            line = line.strip()
            if not in_unigrams:
                # Skip the header until the "\1-grams:" section begins.
                if line == "\\1-grams:":
                    in_unigrams = True
                continue
            if line.startswith("\\"):
                # Next section header: the unigram block is finished.
                break
            # Unigram entries look like "<logprob>\t<word>[\t<backoff>]";
            # only the word is needed, pyctcdecode ignores the scores.
            parts = line.split(maxsplit=2)
            if len(parts) >= 2:
                unigrams.add(parts[1])
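
    # Build the beam-search CTC decoder: labels from the acoustic model,
    # the binary KenLM for scoring, and the unigram set for word lookups.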
    decoder = build_ctcdecoder(tokens, binary_path, unigrams=unigrams)


def transcribe(audio_path):
    if not audio_path:
        return "No audio provided."
    init_models()
    # librosa decodes the file and resamples it to the 16 kHz mono float
    # signal that wav2vec2 expects.
    audio_np, sr = librosa.load(audio_path, sr=16000)
    audio_np = audio_np.astype(np.float32)
    inputs = processor(audio_np, sampling_rate=sr, return_tensors="pt")
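    # Forward pass without gradients; drop the batch dimension so the
    # logits have shape (time_frames, vocab_size).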
    with torch.no_grad():
        logits = model(**inputs).logits.cpu().numpy()[0]
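    # LM-fused beam search over the logit matrix yields the final text.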
    return decoder.decode(logits)


iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(type="filepath", label="Upload a WAV file"),
    outputs="text",
    title="Indic ASR Demo (Hindi)",
    description="Upload a Hindi audio file (.wav) to see the transcription.",
)


if __name__ == "__main__":
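    # 0.0.0.0:7860 is the standard bind address/port for a Gradio app on
    # Hugging Face Spaces; ssr_mode=False skips the Node-based SSR frontend.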
    iface.launch(server_name="0.0.0.0", server_port=7860, ssr_mode=False)