Update README.md
Browse files
README.md
CHANGED
@@ -14,7 +14,7 @@ tags:
|
|
14 |
- audio
|
15 |
- speech
|
16 |
---
|
17 |
-
|
18 |
|
19 |
# GigaAM-v2-CTC with ngram LM and beamsearch 🤗 Hugging Face transformers
|
20 |
|
@@ -66,8 +66,13 @@ input_features = processor(wav[0], sampling_rate=16000, return_tensors="pt")
|
|
66 |
with torch.no_grad():
|
67 |
logits = model(**input_features).logits
|
68 |
|
69 |
-
# decoding with beamsearch and LM
|
70 |
-
transcription = processor.batch_decode(
|
|
|
|
|
|
|
|
|
|
|
71 |
|
72 |
```
|
73 |
|
@@ -78,7 +83,13 @@ In our case (Conformer) `MODEL_STRIDE = 40` ms per timestamp.
|
|
78 |
|
79 |
```python
|
80 |
MODEL_STRIDE = 40
|
81 |
-
outputs = processor.batch_decode(
|
|
|
|
|
|
|
|
|
|
|
|
|
82 |
word_ts = [
|
83 |
{
|
84 |
"word": d["word"],
|
|
|
14 |
- audio
|
15 |
- speech
|
16 |
---
|
17 |
+
[](https://colab.research.google.com/gist/waveletdeboshir/07e39ae96f27331aa3e1e053c2c2f9e8/gigaam-ctc-hf-with-lm.ipynb)
|
18 |
|
19 |
# GigaAM-v2-CTC with ngram LM and beamsearch 🤗 Hugging Face transformers
|
20 |
|
|
|
66 |
with torch.no_grad():
|
67 |
logits = model(**input_features).logits
|
68 |
|
69 |
+
# decoding with beamsearch and LM (tune alpha, beta, beam_width for your data)
|
70 |
+
transcription = processor.batch_decode(
|
71 |
+
logits=logits.numpy(),
|
72 |
+
beam_width=64,
|
73 |
+
alpha=0.5,
|
74 |
+
beta=0.5,
|
75 |
+
).text[0]
|
76 |
|
77 |
```
|
78 |
|
|
|
83 |
|
84 |
```python
|
85 |
MODEL_STRIDE = 40
|
86 |
+
outputs = processor.batch_decode(
|
87 |
+
logits=logits.numpy(),
|
88 |
+
beam_width=64,
|
89 |
+
alpha=0.5,
|
90 |
+
beta=0.5,
|
91 |
+
output_word_offsets=True
|
92 |
+
)
|
93 |
word_ts = [
|
94 |
{
|
95 |
"word": d["word"],
|