beshiribrahim committed
Commit 66f357d · verified
1 Parent(s): 3518f84

Upload transcribe.ipynb

Files changed (1)
transcribe.ipynb +683 -1
transcribe.ipynb CHANGED
@@ -1 +1,683 @@
- {"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"machine_shape":"hm","mount_file_id":"15JwXGAHSNDvOfhIDFaj8h2bt5FijajiD","authorship_tag":"ABX9TyOIpJiulD+u85Vw+4eh9A8m"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"code","source":[],"metadata":{"id":"Pj64tkijY4tT","executionInfo":{"status":"ok","timestamp":1756685653771,"user_tz":240,"elapsed":5,"user":{"displayName":"Beshir Ibrahim","userId":"16736839346810179639"}}},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["%%capture\n","# Core libraries\n","!pip install torch torchaudio transformers pydub numpy pyctcdecode\n","# If you need mp3 input support\n","!sudo apt-get update -qq\n","!sudo apt-get install -y ffmpeg\n","# For KenLM ARPA/bin support\n","!pip install https://github.com/kpu/kenlm/archive/master.zip"],"metadata":{"id":"d6IIQn8_hEAy","executionInfo":{"status":"ok","timestamp":1756686608615,"user_tz":240,"elapsed":56491,"user":{"displayName":"Beshir Ibrahim","userId":"16736839346810179639"}}},"execution_count":9,"outputs":[]},{"cell_type":"code","source":["MODEL_PATH = \"/content/drive/MyDrive/artifacts/models/hf/hf_tgt/tigre-asr-Wav2Vec2Bert\" # model and processor path\n","PROCESSOR_PATH = MODEL_PATH\n","AUDIO_FILE = MODEL_PATH+\"/sample.wav\"\n","OUTPUT_TXT = None # e.g., \"/path/to/out.txt\" or None to just print\n","# KenLM + lexicon (optional but recommended for beam search)\n","KENLM_ARPA = MODEL_PATH+\"/lm.arpa\" # set to None to decode WITHOUT LM\n","LEXICON_TXT = MODEL_PATH+\"/lexicon.txt\" # used to load unigrams; set to None if not available"],"metadata":{"id":"B81FMlsQlSOh","executionInfo":{"status":"ok","timestamp":1756686366477,"user_tz":240,"elapsed":13,"user":{"displayName":"Beshir Ibrahim","userId":"16736839346810179639"}}},"execution_count":7,"outputs":[]},{"cell_type":"code","source":["import warnings\n","import logging\n","\n","# Silence all Python warnings\n","warnings.filterwarnings(\"ignore\")\n","# Silence pyctcdecode logger\n","logging.getLogger(\"pyctcdecode\").setLevel(logging.ERROR)\n","# Silence torchaudio warnings (optionally all)\n","logging.getLogger(\"torchaudio\").setLevel(logging.ERROR)"],"metadata":{"id":"Y90co7BOmK9n","executionInfo":{"status":"ok","timestamp":1756685692591,"user_tz":240,"elapsed":6,"user":{"displayName":"Beshir Ibrahim","userId":"16736839346810179639"}}},"execution_count":2,"outputs":[]},{"cell_type":"code","source":["# Audio / chunking\n","TARGET_SR = 16000\n","CHUNK_SEC = 5 # chunk length in seconds\n","OVERLAP_SEC = 0 # overlap between chunks in seconds (0 for minimal code)\n","# Beam search params\n","BEAM_WIDTH = 150\n","LM_ALPHA = 0.5\n","LM_BETA = 1.0"],"metadata":{"id":"7DOmsFxbnzwK","executionInfo":{"status":"ok","timestamp":1756685693508,"user_tz":240,"elapsed":5,"user":{"displayName":"Beshir Ibrahim","userId":"16736839346810179639"}}},"execution_count":3,"outputs":[]},{"cell_type":"code","source":["import os\n","import torch\n","import numpy as np\n","import torchaudio\n","from typing import List, Optional\n","\n","# Use pydub for robust mp3 handling\n","from pydub import AudioSegment\n","\n","from transformers import Wav2Vec2BertForCTC, Wav2Vec2BertProcessor\n","\n","# Optional LM decoding\n","try:\n"," from pyctcdecode import build_ctcdecoder\n"," _HAS_PYCTC = True\n","except Exception:\n"," _HAS_PYCTC = False\n","\n","# Pick device\n","device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n","\n","def _load_audio(path: str, target_sr: int = 16000) -> torch.Tensor:\n"," \"\"\"Load WAV or MP3 to mono float32 tensor [1, T] at target_sr.\"\"\"\n"," ext = os.path.splitext(path)[1].lower()\n"," if ext == \".mp3\":\n"," audio = AudioSegment.from_file(path, format=\"mp3\")\n"," audio = audio.set_channels(1).set_frame_rate(target_sr)\n"," samples = np.array(audio.get_array_of_samples()).astype(np.float32)\n"," # pydub gives int PCM range; normalize if needed (assume 16-bit)\n"," if samples.dtype != np.float32:\n"," samples = samples.astype(np.float32)\n"," # If sample_width==2 (16-bit), divide by 32768\n"," if audio.sample_width == 2:\n"," samples /= 32768.0\n"," return torch.from_numpy(samples).unsqueeze(0)\n"," else:\n"," wav, sr = torchaudio.load(path)\n"," if wav.shape[0] > 1:\n"," wav = wav.mean(dim=0, keepdim=True) # stereo -> mono\n"," if sr != target_sr:\n"," wav = torchaudio.transforms.Resample(sr, target_sr)(wav)\n"," # ensure float32 in [-1,1]\n"," if wav.dtype != torch.float32:\n"," wav = wav.to(torch.float32)\n"," return wav\n","\n","def _chunks(wave: torch.Tensor, sr: int, chunk_sec: int, overlap_sec: int):\n"," \"\"\"Yield possibly-overlapping chunks [1, T_chunk].\"\"\"\n"," chunk = int(chunk_sec * sr)\n"," step = max(1, chunk - int(overlap_sec * sr))\n"," T = wave.size(-1)\n"," for start in range(0, T, step):\n"," end = min(start + chunk, T)\n"," yield wave[:, start:end]\n"," if end >= T:\n"," break\n","\n","def _load_unigrams(lexicon_path: Optional[str]) -> List[str]:\n"," \"\"\"Read first token per line from lexicon into a unigram list.\"\"\"\n"," if not lexicon_path or not os.path.exists(lexicon_path):\n"," return []\n"," words = set()\n"," with open(lexicon_path, \"r\", encoding=\"utf-8\") as f:\n"," for line in f:\n"," w = line.strip().split()\n"," if w:\n"," words.add(w[0])\n"," return sorted(words)\n","\n","def _build_decoder(model, processor):\n"," \"\"\"Build a pyctcdecode decoder from model vocab + KenLM (if configured).\"\"\"\n"," # Build vocab (id -> token)\n"," vocab_size = model.lm_head.out_features\n"," labels = []\n"," for i in range(vocab_size):\n"," tok = processor.tokenizer.convert_ids_to_tokens([i])[0]\n"," # remove common BPE markers\n"," tok = tok.lstrip(\"Ġ\").lstrip(\"▁\")\n"," labels.append(tok)\n","\n"," # No LM? Use labels only; with LM? also pass unigrams + alpha/beta\n"," if not _HAS_PYCTC:\n"," return None\n","\n"," if KENLM_ARPA and os.path.exists(KENLM_ARPA):\n"," unigrams = _load_unigrams(LEXICON_TXT)\n"," return build_ctcdecoder(\n"," labels=labels,\n"," kenlm_model_path=KENLM_ARPA,\n"," unigrams=unigrams if unigrams else None,\n"," alpha=LM_ALPHA,\n"," beta=LM_BETA\n"," )\n"," else:\n"," # Fallback to lexicon-less decoder (greedy-ish beam without LM)\n"," return build_ctcdecoder(labels=labels)\n","\n","def _postprocess(text: str) -> str:\n"," \"\"\"Light cleanup: strip special markers, collapse dup words, ensure end punctuation.\"\"\"\n"," text = text.replace(\"<|\", \"\").replace(\"|>\", \"\").replace(\"<>\", \"\").strip()\n"," words, cleaned = text.split(), []\n"," for w in words:\n"," if not cleaned or cleaned[-1] != w:\n"," cleaned.append(w)\n"," out = \" \".join(cleaned).strip()\n"," if out and out[-1] not in \".!?\":\n"," out += \".\"\n"," return out\n","\n","def transcribe_one_file() -> str:\n"," # Load model + processor\n"," model = Wav2Vec2BertForCTC.from_pretrained(MODEL_PATH).to(device).eval()\n"," processor = Wav2Vec2BertProcessor.from_pretrained(PROCESSOR_PATH)\n","\n"," # Optional decoder\n"," decoder = _build_decoder(model, processor)\n","\n"," # Load audio\n"," wav = _load_audio(AUDIO_FILE, TARGET_SR)\n","\n"," # Transcribe by chunks\n"," pieces = []\n"," for chunk in _chunks(wav, TARGET_SR, CHUNK_SEC, OVERLAP_SEC):\n"," # processor for Wav2Vec2Bert expects raw audio -> input_features\n"," inputs = processor(chunk.squeeze().numpy(), sampling_rate=TARGET_SR, return_tensors=\"pt\").to(device)\n"," with torch.no_grad():\n"," logits = model(input_features=inputs.input_features).logits # [1, T, V]\n"," logp = logits[0].cpu().numpy()\n","\n"," if decoder is not None:\n"," hypo = decoder.decode(logp, beam_width=BEAM_WIDTH)\n"," else:\n"," # Greedy fallback if pyctcdecode not available\n"," ids = logp.argmax(axis=-1)\n"," tokens = processor.tokenizer.convert_ids_to_tokens(ids.tolist())\n"," hypo = \"\".join(tokens)\n","\n"," if hypo.strip():\n"," pieces.append(hypo.strip())\n","\n"," # cleanup per chunk\n"," del inputs, logits, logp\n","\n"," text = _postprocess(\" \".join(pieces))\n"," return text\n","\n","if __name__ == \"__main__\":\n"," out = transcribe_one_file()\n"," if OUTPUT_TXT:\n"," os.makedirs(os.path.dirname(OUTPUT_TXT), exist_ok=True)\n"," with open(OUTPUT_TXT, \"w\", encoding=\"utf-8\") as f:\n"," f.write(out + \"\\n\")\n"," print(out)\n"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"W1rQvavueaBI","executionInfo":{"status":"ok","timestamp":1756686391018,"user_tz":240,"elapsed":15969,"user":{"displayName":"Beshir Ibrahim","userId":"16736839346810179639"}},"outputId":"8c5a1a6a-dd57-4b82-f891-0f4e45945a93"},"execution_count":8,"outputs":[{"output_type":"stream","name":"stdout","text":["αˆ•αˆ­αŒŠαŒŽ ምነ αˆαŠ• α‹˜α‰ αŠ• αŠ α‰΅αˆ«αŠ­ αŠ₯αŠ•α‹΄ αŠ αŠ•α‰ α‰°α‰΅ αŠ₯α‰₯ αˆ˜αˆ¨α‰£α‰€α‹ ግሩም αˆˆα‰΅αˆαˆŒ αˆ˜α‹²αŠα‰΅ α‰°.\n"]}]},{"cell_type":"code","source":[],"metadata":{"id":"qs6x1lHOlthS"},"execution_count":null,"outputs":[]}]}
+ {
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+ "colab": {
+ "provenance": [],
+ "machine_shape": "hm"
+ },
+ "kernelspec": {
+ "name": "python3",
+ "display_name": "Python 3"
+ },
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "cells": [
+ {
+ "cell_type": "code",
+ "source": [],
+ "metadata": {
+ "id": "Pj64tkijY4tT"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "%%capture\n",
+ "# Core libraries\n",
+ "!pip install torch torchaudio transformers pydub numpy pyctcdecode\n",
+ "# If you need mp3 input support\n",
+ "!sudo apt-get update -qq\n",
+ "!sudo apt-get install -y ffmpeg\n",
+ "# For KenLM ARPA/bin support\n",
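+ "# Note: this builds the kenlm wheel from C++ source; if the build fails on a\n",
+ "# bare image, installing cmake and build-essential first is a likely fix\n",
+ "# (assumption; Colab usually ships both)\n",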
+ "!pip install https://github.com/kpu/kenlm/archive/master.zip"
+ ],
+ "metadata": {
+ "id": "d6IIQn8_hEAy"
+ },
+ "execution_count": 9,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "local_dir = \"/content/tigre-asr-Wav2Vec2Bert\" # or any local path\n",
+ "OUTPUT_TXT = None # e.g., \"/path/to/out.txt\" or None to just print"
+ ],
+ "metadata": {
+ "id": "5j0nwQYG2B4T"
+ },
+ "execution_count": 21,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from huggingface_hub import snapshot_download\n",
+ "\n",
+ "MODEL_PATH = \"BeitTigreAI/tigre-asr-Wav2Vec2Bert\"\n",
+ "PROCESSOR_PATH = MODEL_PATH\n",
+ "\n",
+ "snapshot_download(\n",
+ "    repo_id = MODEL_PATH,\n",
+ "    repo_type = \"model\",\n",
+ "    local_dir = local_dir,\n",
+ "    local_dir_use_symlinks = False # copies files fully\n",
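+ "    # note: local_dir_use_symlinks is deprecated in recent huggingface_hub\n",
+ "    # releases, which copy into local_dir by default; kept for older installs\n",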
+ ")\n",
+ "\n",
+ "AUDIO_FILE = f\"{local_dir}/sample.wav\"\n",
+ "KENLM_ARPA = f\"{local_dir}/lm.arpa\" # if uploaded\n",
+ "LEXICON_TXT = f\"{local_dir}/lexicon.txt\""
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 49
+ },
+ "id": "WNhgqJET08VF",
+ "outputId": "559f767c-d269-4908-ca5e-1bffb8c39395"
+ },
+ "execution_count": 22,
+ "outputs": [
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "Fetching 12 files: 0%| | 0/12 [00:00<?, ?it/s]"
+ ]
+ },
+ "metadata": {}
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import warnings\n",
+ "import logging\n",
+ "\n",
+ "# Silence all Python warnings\n",
+ "warnings.filterwarnings(\"ignore\")\n",
+ "# Silence pyctcdecode logger\n",
+ "logging.getLogger(\"pyctcdecode\").setLevel(logging.ERROR)\n",
+ "# Silence torchaudio warnings (optionally all)\n",
+ "logging.getLogger(\"torchaudio\").setLevel(logging.ERROR)"
+ ],
+ "metadata": {
+ "id": "Y90co7BOmK9n"
+ },
+ "execution_count": 23,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Audio / chunking\n",
+ "TARGET_SR = 16000\n",
+ "CHUNK_SEC = 5 # chunk length in seconds\n",
+ "OVERLAP_SEC = 0 # overlap between chunks in seconds (0 for minimal code)\n",
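+ "# Example: a 12 s clip at 16 kHz with CHUNK_SEC=5, OVERLAP_SEC=0 is cut into\n",
+ "# 5 s + 5 s + 2 s windows (chunk starts at samples 0, 80000, 160000)\n",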
+ "# Beam search params\n",
+ "BEAM_WIDTH = 150\n",
+ "LM_ALPHA = 0.5\n",
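+ "# (in pyctcdecode, alpha scales the LM score and beta is a per-word\n",
+ "# insertion bonus; both are usually tuned on held-out audio)\n",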
+ "LM_BETA = 1.0"
+ ],
+ "metadata": {
+ "id": "7DOmsFxbnzwK"
+ },
+ "execution_count": 24,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import os\n",
+ "import torch\n",
+ "import numpy as np\n",
+ "import torchaudio\n",
+ "from typing import List, Optional\n",
+ "\n",
+ "# Use pydub for robust mp3 handling\n",
+ "from pydub import AudioSegment\n",
+ "\n",
+ "from transformers import Wav2Vec2BertForCTC, Wav2Vec2BertProcessor\n",
+ "\n",
+ "# Optional LM decoding\n",
+ "try:\n",
+ "    from pyctcdecode import build_ctcdecoder\n",
+ "    _HAS_PYCTC = True\n",
+ "except Exception:\n",
+ "    _HAS_PYCTC = False\n",
+ "\n",
+ "# Pick device\n",
+ "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
+ "\n",
+ "def _load_audio(path: str, target_sr: int = 16000) -> torch.Tensor:\n",
+ "    \"\"\"Load WAV or MP3 to mono float32 tensor [1, T] at target_sr.\"\"\"\n",
+ "    ext = os.path.splitext(path)[1].lower()\n",
+ "    if ext == \".mp3\":\n",
+ "        audio = AudioSegment.from_file(path, format=\"mp3\")\n",
+ "        audio = audio.set_channels(1).set_frame_rate(target_sr)\n",
+ "        samples = np.array(audio.get_array_of_samples()).astype(np.float32)\n",
+ "        # pydub yields integer PCM; scale 16-bit samples to [-1, 1]\n",
+ "        if audio.sample_width == 2:\n",
+ "            samples /= 32768.0\n",
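+ "        # other sample widths (8/24/32-bit) would need a different scale;\n",
+ "        # the bundled sample audio is assumed to be 16-bit PCM\n",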
+ "        return torch.from_numpy(samples).unsqueeze(0)\n",
+ "    else:\n",
+ "        wav, sr = torchaudio.load(path)\n",
+ "        if wav.shape[0] > 1:\n",
+ "            wav = wav.mean(dim=0, keepdim=True) # stereo -> mono\n",
+ "        if sr != target_sr:\n",
+ "            wav = torchaudio.transforms.Resample(sr, target_sr)(wav)\n",
+ "        # ensure float32 in [-1,1]\n",
+ "        if wav.dtype != torch.float32:\n",
+ "            wav = wav.to(torch.float32)\n",
+ "        return wav\n",
+ "\n",
+ "def _chunks(wave: torch.Tensor, sr: int, chunk_sec: int, overlap_sec: int):\n",
+ "    \"\"\"Yield possibly-overlapping chunks [1, T_chunk].\"\"\"\n",
+ "    chunk = int(chunk_sec * sr)\n",
+ "    step = max(1, chunk - int(overlap_sec * sr))\n",
+ "    T = wave.size(-1)\n",
+ "    for start in range(0, T, step):\n",
+ "        end = min(start + chunk, T)\n",
+ "        yield wave[:, start:end]\n",
+ "        if end >= T:\n",
+ "            break\n",
+ "\n",
+ "def _load_unigrams(lexicon_path: Optional[str]) -> List[str]:\n",
+ "    \"\"\"Read first token per line from lexicon into a unigram list.\"\"\"\n",
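+ "    # assumes a Kaldi/Flashlight-style lexicon: \"word t o k e n s\" per line\n",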
+ "    if not lexicon_path or not os.path.exists(lexicon_path):\n",
+ "        return []\n",
+ "    words = set()\n",
+ "    with open(lexicon_path, \"r\", encoding=\"utf-8\") as f:\n",
+ "        for line in f:\n",
+ "            w = line.strip().split()\n",
+ "            if w:\n",
+ "                words.add(w[0])\n",
+ "    return sorted(words)\n",
+ "\n",
+ "def _build_decoder(model, processor):\n",
+ "    \"\"\"Build a pyctcdecode decoder from model vocab + KenLM (if configured).\"\"\"\n",
+ "    # Build vocab (id -> token)\n",
+ "    vocab_size = model.lm_head.out_features\n",
+ "    labels = []\n",
+ "    for i in range(vocab_size):\n",
+ "        tok = processor.tokenizer.convert_ids_to_tokens([i])[0]\n",
+ "        # remove common BPE markers\n",
+ "        tok = tok.lstrip(\"Ġ\").lstrip(\"▁\")\n",
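+ "        # (a Wav2Vec2-style CTC vocab is usually plain characters,\n",
+ "        # so this is defensive and normally a no-op)\n",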
+ "        labels.append(tok)\n",
+ "\n",
+ "    # No LM? Use labels only; with LM? also pass unigrams + alpha/beta\n",
+ "    if not _HAS_PYCTC:\n",
+ "        return None\n",
+ "\n",
+ "    if KENLM_ARPA and os.path.exists(KENLM_ARPA):\n",
+ "        unigrams = _load_unigrams(LEXICON_TXT)\n",
+ "        return build_ctcdecoder(\n",
+ "            labels=labels,\n",
+ "            kenlm_model_path=KENLM_ARPA,\n",
+ "            unigrams=unigrams if unigrams else None,\n",
+ "            alpha=LM_ALPHA,\n",
+ "            beta=LM_BETA\n",
+ "        )\n",
+ "    else:\n",
+ "        # Fallback to lexicon-less decoder (beam search without LM)\n",
+ "        return build_ctcdecoder(labels=labels)\n",
+ "\n",
+ "def _postprocess(text: str) -> str:\n",
+ "    \"\"\"Light cleanup: strip special markers, collapse duplicate words, ensure end punctuation.\"\"\"\n",
+ "    text = text.replace(\"<|\", \"\").replace(\"|>\", \"\").replace(\"<>\", \"\").strip()\n",
+ "    words, cleaned = text.split(), []\n",
+ "    for w in words:\n",
+ "        if not cleaned or cleaned[-1] != w:\n",
+ "            cleaned.append(w)\n",
+ "    out = \" \".join(cleaned).strip()\n",
+ "    if out and out[-1] not in \".!?\":\n",
+ "        out += \".\"\n",
+ "    return out\n",
+ "\n",
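+ "# e.g. _postprocess(\"<|she said said hello\") -> \"she said hello.\"\n",
+ "# note it also drops genuinely repeated words, a deliberate trade-off\n",
+ "\n",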
+ "def transcribe_one_file() -> str:\n",
+ "    # Load model + processor\n",
+ "    model = Wav2Vec2BertForCTC.from_pretrained(MODEL_PATH).to(device).eval()\n",
+ "    processor = Wav2Vec2BertProcessor.from_pretrained(PROCESSOR_PATH)\n",
+ "\n",
+ "    # Optional decoder\n",
+ "    decoder = _build_decoder(model, processor)\n",
+ "\n",
+ "    # Load audio\n",
+ "    wav = _load_audio(AUDIO_FILE, TARGET_SR)\n",
+ "\n",
+ "    # Transcribe by chunks\n",
+ "    pieces = []\n",
+ "    for chunk in _chunks(wav, TARGET_SR, CHUNK_SEC, OVERLAP_SEC):\n",
+ "        # processor for Wav2Vec2Bert expects raw audio -> input_features\n",
+ "        inputs = processor(chunk.squeeze().numpy(), sampling_rate=TARGET_SR, return_tensors=\"pt\").to(device)\n",
+ "        with torch.no_grad():\n",
+ "            logits = model(input_features=inputs.input_features).logits # [1, T, V]\n",
+ "        # pyctcdecode expects log-probabilities, so log-softmax the logits\n",
+ "        # (it warns on unnormalized input, which the silenced logger would hide)\n",
+ "        logp = torch.log_softmax(logits[0], dim=-1).cpu().numpy()\n",
+ "\n",
+ "        if decoder is not None:\n",
+ "            hypo = decoder.decode(logp, beam_width=BEAM_WIDTH)\n",
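+ "            # wider beams explore more hypotheses per step at higher latency\n",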
+ "        else:\n",
+ "            # Greedy fallback if pyctcdecode is not available:\n",
+ "            # the CTC tokenizer collapses repeats and strips blanks\n",
+ "            ids = logp.argmax(axis=-1)\n",
+ "            hypo = processor.tokenizer.decode(ids.tolist())\n",
+ "\n",
+ "        if hypo.strip():\n",
+ "            pieces.append(hypo.strip())\n",
+ "\n",
+ "        # cleanup per chunk\n",
+ "        del inputs, logits, logp\n",
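+ "        # (dropping per-chunk tensors keeps GPU memory flat on long files)\n",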
+ "\n",
+ "    text = _postprocess(\" \".join(pieces))\n",
+ "    return text\n",
+ "\n",
+ "if __name__ == \"__main__\":\n",
+ "    out = transcribe_one_file()\n",
+ "    if OUTPUT_TXT:\n",
+ "        # guard: dirname is \"\" when OUTPUT_TXT has no directory part\n",
+ "        os.makedirs(os.path.dirname(OUTPUT_TXT) or \".\", exist_ok=True)\n",
+ "        with open(OUTPUT_TXT, \"w\", encoding=\"utf-8\") as f:\n",
+ "            f.write(out + \"\\n\")\n",
+ "    print(out)\n"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "W1rQvavueaBI",
+ "outputId": "76136358-df59-4f98-bfcc-8e01dc51ed6d"
+ },
+ "execution_count": 25,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "αˆ•αˆ­αŒŠαŒŽ ምነ αˆαŠ• α‹˜α‰ αŠ• αŠ α‰΅αˆ«αŠ­ αŠ₯αŠ•α‹΄ αŠ αŠ•α‰ α‰°α‰΅ αŠ₯α‰₯ αˆ˜αˆ¨α‰£α‰€α‹ ግሩም αˆˆα‰΅αˆαˆŒ αˆ˜α‹²αŠα‰΅ α‰°.\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [],
+ "metadata": {
+ "id": "qs6x1lHOlthS"
+ },
+ "execution_count": null,
+ "outputs": []
+ }
+ ]
+ }