vikram-iitm committed on
Commit a81fe15 · 1 Parent(s): b215b6d

Initial commit for asr-demo

.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.arpa filter=lfs diff=lfs merge=lfs -text
+ *.binary filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,74 @@
+ import gradio as gr
+ import numpy as np
+ from transformers import AutoProcessor, AutoModelForCTC
+ from pyctcdecode import build_ctcdecoder
+ import librosa
+ import torch
+
+ # Load the acoustic model and processor; the LM files live next to app.py in the Space.
+ processor = AutoProcessor.from_pretrained("ai4bharat/indicwav2vec-hindi")
+ model = AutoModelForCTC.from_pretrained("ai4bharat/indicwav2vec-hindi")
+
+ # Build the CTC label list in vocabulary-index order, as pyctcdecode expects.
+ vocab = processor.tokenizer.get_vocab()
+ sorted_vocab = sorted(vocab.items(), key=lambda kv: kv[1])
+ tokens = [token for token, _ in sorted_vocab]
+
+ arpa_pruned = "./hindi_small_4gram_pruned_clean.arpa"
+ binary_pruned = "./hindi_small_4gram_pruned.binary"
+
+ def load_unigrams(arpa_path):
+     # Collect the words (and their log10 scores) from the \1-grams: section of the ARPA file.
+     unigrams = {}
+     with open(arpa_path, encoding="utf-8") as f:
+         in_unigrams = False
+         for line in f:
+             line = line.strip()
+             if not in_unigrams:
+                 if line == "\\1-grams:":
+                     in_unigrams = True
+                 continue
+             if line.startswith("\\"):
+                 break
+             parts = line.split(maxsplit=2)
+             if len(parts) >= 2:
+                 score = float(parts[0])
+                 tok = parts[1]
+                 unigrams[tok] = score
+     return unigrams
+
+ unigram_scores = load_unigrams(arpa_pruned)
+ # pyctcdecode only needs the unigram word list here; the KenLM binary provides the scores.
+ decoder = build_ctcdecoder(tokens, binary_pruned, unigrams=list(unigram_scores))
+
+ def transcribe(audio):
+     if audio is None:
+         return "No audio provided."
+     sr, audio_np = audio
+     # Gradio hands over raw PCM (often int16, possibly stereo); convert to mono float32 in [-1, 1].
+     if np.issubdtype(audio_np.dtype, np.integer):
+         audio_np = audio_np.astype(np.float32) / np.iinfo(audio_np.dtype).max
+     else:
+         audio_np = audio_np.astype(np.float32)
+     if audio_np.ndim > 1:
+         audio_np = audio_np.mean(axis=1)
+     if sr != 16000:
+         audio_np = librosa.resample(audio_np, orig_sr=sr, target_sr=16000)
+         sr = 16000
+     inputs = processor(audio_np, sampling_rate=sr, return_tensors="pt")
+     with torch.no_grad():
+         logits = model(**inputs).logits.cpu().numpy()[0]
+     # Beam-search decode the CTC logits with the KenLM language model.
+     text = decoder.decode(logits)
+     return text
+
+ iface = gr.Interface(
+     fn=transcribe,
+     inputs=gr.Audio(sources=["microphone", "upload"], type="numpy", label="Record or Upload Audio"),
+     outputs="text",
+     title="Indic ASR Demo (Hindi)",
+     description="Record or upload Hindi audio and get an instant transcription from the ASR model plus a custom language model.",
+ )
+
+ if __name__ == "__main__":
+     iface.launch()
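
As a quick sanity check of the same pipeline outside Gradio, transcribe() can be called directly on a local recording. This is only a sketch: sample_hi.wav is a placeholder file name, not something shipped in this commit, and it assumes app.py plus the two LM files are in the working directory so the module-level model loading succeeds.

    import librosa
    from app import transcribe  # loads the model, processor and decoder at import time

    # librosa.load returns mono float32 audio; sr=None keeps the native rate so transcribe() resamples it.
    audio, sr = librosa.load("sample_hi.wav", sr=None)
    print(transcribe((sr, audio)))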
hindi_small_4gram_pruned.binary ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bd0483a6dc50e3baf988d5858f966578c9532e402a7128f29590e02e1180d800
+ size 282972866
hindi_small_4gram_pruned_clean.arpa ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:96d44938524ca3ae7fb7a1470fc0dc3e11c2f6ecf0022ed2b6982f01b5d43dd1
+ size 535622088
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ transformers
+ pyctcdecode
+ torch
+ librosa
+ gradio