# Spaces: Sleeping
import gradio as gr | |
import librosa | |
import torch | |
import numpy as np | |
# Module-level slots for the heavyweight ASR objects. They all start out as
# None and are filled in by init_models() the first time it runs.
processor = model = decoder = None
def _read_arpa_unigrams(arpa_path):
    """Read the 1-gram section of an ARPA language-model file.

    Lines before the 1-grams header are skipped; reading stops at the next
    line that starts with a backslash (the following section header or the
    end marker).

    Args:
        arpa_path: Path to a UTF-8 encoded ARPA file.

    Returns:
        A dict mapping each unigram (second whitespace-separated column) to
        the float parsed from the first column of its line.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the first column of an entry is not a valid float.
    """
    unigrams = {}
    with open(arpa_path, encoding="utf-8") as f:
        in_unigrams = False
        for raw_line in f:
            line = raw_line.strip()
            if not in_unigrams:
                # Still in the preamble: only the section header matters.
                in_unigrams = line == "\\1-grams:"
                continue
            if line.startswith("\\"):
                # Next section header (or end marker): unigrams are done.
                break
            # Entries look like "<score> <word> [<backoff>]"; blank lines
            # and anything with fewer than two columns are ignored.
            parts = line.split(maxsplit=2)
            if len(parts) >= 2:
                unigrams[parts[1]] = float(parts[0])
    return unigrams


def init_models():
    """Load the processor, CTC model and LM-backed decoder on first use.

    Populates the module-level ``processor``, ``model`` and ``decoder``
    globals. Calls after a successful initialisation return immediately.

    Everything is built in local variables and the globals are assigned
    together at the very end, so a failure part-way through (download error,
    missing language-model file, malformed ARPA entry) leaves all three
    globals untouched and the next call retries from scratch instead of
    returning early with a half-initialised state.

    Raises:
        OSError: If the ARPA file cannot be opened.
        ValueError: If an ARPA unigram entry has a non-numeric first column.
        Exception: Whatever the model/decoder loaders raise is propagated.
    """
    global processor, model, decoder
    # The decoder is the last object built, so it being set means a previous
    # call ran to completion.
    if decoder is not None:
        return

    # Imported here rather than at module level, in keeping with the lazy
    # initialisation of the objects themselves.
    from transformers import AutoProcessor, AutoModelForCTC
    from pyctcdecode import build_ctcdecoder

    loaded_processor = AutoProcessor.from_pretrained("ai4bharat/indicwav2vec-hindi")
    loaded_model = AutoModelForCTC.from_pretrained("ai4bharat/indicwav2vec-hindi")

    # The decoder needs the token labels ordered by their vocabulary index.
    vocab = loaded_processor.tokenizer.get_vocab()
    sorted_vocab = sorted(vocab.items(), key=lambda kv: kv[1])
    tokens = [tok for tok, _ in sorted_vocab]

    arpa_path = "./hindi_small_4gram_pruned_clean.arpa"
    binary_path = "./hindi_small_4gram_pruned.binary"

    unigrams = _read_arpa_unigrams(arpa_path)
    built_decoder = build_ctcdecoder(tokens, binary_path, unigrams=unigrams)

    # Publish all three together only once every step has succeeded.
    processor, model, decoder = loaded_processor, loaded_model, built_decoder
def transcribe(audio_path):
    """Transcribe an audio file to text using the lazily-loaded ASR stack.

    Args:
        audio_path: Path of the audio file to transcribe. A falsy value
            (``None`` or an empty string) means nothing was supplied.

    Returns:
        The decoder's text output for the clip, or the string
        ``"No audio provided."`` when no path was given or the file contains
        no audio samples.
    """
    if not audio_path:
        return "No audio provided."

    # librosa.load decodes the file and resamples it to 16 kHz. Done before
    # the model load so unreadable or empty files fail fast.
    audio_np, sr = librosa.load(audio_path, sr=16000)
    if audio_np.size == 0:
        # A zero-length clip has nothing to decode; report it like a missing
        # upload instead of passing an empty array to the model.
        return "No audio provided."

    init_models()

    audio_np = audio_np.astype(np.float32)
    inputs = processor(audio_np, sampling_rate=sr, return_tensors="pt")
    with torch.no_grad():
        # Take the first (only) item of the batch as a NumPy array for the
        # decoder.
        logits = model(**inputs).logits.cpu().numpy()[0]
    return decoder.decode(logits)
# Demo UI: a single audio upload wired to transcribe(), with the result
# rendered as plain text.
iface = gr.Interface(
    title="Indic ASR Demo (Hindi)",
    description="Upload a Hindi audio file (.wav) to see the transcription.",
    fn=transcribe,
    inputs=gr.Audio(label="Upload a WAV file", type="filepath"),
    outputs="text",
)

# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch(
        server_name="0.0.0.0",
        server_port=7860,
        ssr_mode=False,
    )