beshiribrahim committed
Commit 66f357d · verified
1 Parent(s): 3518f84

Upload transcribe.ipynb

Files changed (1)
transcribe.ipynb +683 -1
transcribe.ipynb CHANGED
@@ -1 +1,683 @@
- {"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"machine_shape":"hm","mount_file_id":"15JwXGAHSNDvOfhIDFaj8h2bt5FijajiD","authorship_tag":"ABX9TyOIpJiulD+u85Vw+4eh9A8m"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"code","source":[],"metadata":{"id":"Pj64tkijY4tT","executionInfo":{"status":"ok","timestamp":1756685653771,"user_tz":240,"elapsed":5,"user":{"displayName":"Beshir Ibrahim","userId":"16736839346810179639"}}},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["%%capture\n","# Core libraries\n","!pip install torch torchaudio transformers pydub numpy pyctcdecode\n","# If you need mp3 input support\n","!sudo apt-get update -qq\n","!sudo apt-get install -y ffmpeg\n","# For KenLM ARPA/bin support\n","!pip install https://github.com/kpu/kenlm/archive/master.zip"],"metadata":{"id":"d6IIQn8_hEAy","executionInfo":{"status":"ok","timestamp":1756686608615,"user_tz":240,"elapsed":56491,"user":{"displayName":"Beshir Ibrahim","userId":"16736839346810179639"}}},"execution_count":9,"outputs":[]},{"cell_type":"code","source":["MODEL_PATH = \"/content/drive/MyDrive/artifacts/models/hf/hf_tgt/tigre-asr-Wav2Vec2Bert\" # model and processor path\n","PROCESSOR_PATH = MODEL_PATH\n","AUDIO_FILE = MODEL_PATH+\"/sample.wav\"\n","OUTPUT_TXT = None # e.g., \"/path/to/out.txt\" or None to just print\n","# KenLM + lexicon (optional but recommended for beam search)\n","KENLM_ARPA = MODEL_PATH+\"/lm.arpa\" # set to None to decode WITHOUT LM\n","LEXICON_TXT = MODEL_PATH+\"/lexicon.txt\" # used to load unigrams; set to None if not available"],"metadata":{"id":"B81FMlsQlSOh","executionInfo":{"status":"ok","timestamp":1756686366477,"user_tz":240,"elapsed":13,"user":{"displayName":"Beshir Ibrahim","userId":"16736839346810179639"}}},"execution_count":7,"outputs":[]},{"cell_type":"code","source":["import warnings\n","import logging\n","\n","# Silence all Python warnings\n","warnings.filterwarnings(\"ignore\")\n","# Silence pyctcdecode logger\n","logging.getLogger(\"pyctcdecode\").setLevel(logging.ERROR)\n","# Silence torchaudio warnings (optionally all)\n","logging.getLogger(\"torchaudio\").setLevel(logging.ERROR)"],"metadata":{"id":"Y90co7BOmK9n","executionInfo":{"status":"ok","timestamp":1756685692591,"user_tz":240,"elapsed":6,"user":{"displayName":"Beshir Ibrahim","userId":"16736839346810179639"}}},"execution_count":2,"outputs":[]},{"cell_type":"code","source":["# Audio / chunking\n","TARGET_SR = 16000\n","CHUNK_SEC = 5 # chunk length in seconds\n","OVERLAP_SEC = 0 # overlap between chunks in seconds (0 for minimal code)\n","# Beam search params\n","BEAM_WIDTH = 150\n","LM_ALPHA = 0.5\n","LM_BETA = 1.0"],"metadata":{"id":"7DOmsFxbnzwK","executionInfo":{"status":"ok","timestamp":1756685693508,"user_tz":240,"elapsed":5,"user":{"displayName":"Beshir Ibrahim","userId":"16736839346810179639"}}},"execution_count":3,"outputs":[]},{"cell_type":"code","source":["import os\n","import torch\n","import numpy as np\n","import torchaudio\n","from typing import List, Optional\n","\n","# Use pydub for robust mp3 handling\n","from pydub import AudioSegment\n","\n","from transformers import Wav2Vec2BertForCTC, Wav2Vec2BertProcessor\n","\n","# Optional LM decoding\n","try:\n"," from pyctcdecode import build_ctcdecoder\n"," _HAS_PYCTC = True\n","except Exception:\n"," _HAS_PYCTC = False\n","\n","# Pick device\n","device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n","\n","def _load_audio(path: str, target_sr: int = 16000) -> torch.Tensor:\n"," \"\"\"Load WAV or MP3 to mono float32 tensor [1, T] at target_sr.\"\"\"\n"," ext = os.path.splitext(path)[1].lower()\n"," if ext == \".mp3\":\n"," audio = AudioSegment.from_file(path, format=\"mp3\")\n"," audio = audio.set_channels(1).set_frame_rate(target_sr)\n"," samples = np.array(audio.get_array_of_samples()).astype(np.float32)\n"," # pydub gives int PCM range; normalize if needed (assume 16-bit)\n"," if samples.dtype != np.float32:\n"," samples = samples.astype(np.float32)\n"," # If sample_width==2 (16-bit), divide by 32768\n"," if audio.sample_width == 2:\n"," samples /= 32768.0\n"," return torch.from_numpy(samples).unsqueeze(0)\n"," else:\n"," wav, sr = torchaudio.load(path)\n"," if wav.shape[0] > 1:\n"," wav = wav.mean(dim=0, keepdim=True) # stereo -> mono\n"," if sr != target_sr:\n"," wav = torchaudio.transforms.Resample(sr, target_sr)(wav)\n"," # ensure float32 in [-1,1]\n"," if wav.dtype != torch.float32:\n"," wav = wav.to(torch.float32)\n"," return wav\n","\n","def _chunks(wave: torch.Tensor, sr: int, chunk_sec: int, overlap_sec: int):\n"," \"\"\"Yield possibly-overlapping chunks [1, T_chunk].\"\"\"\n"," chunk = int(chunk_sec * sr)\n"," step = max(1, chunk - int(overlap_sec * sr))\n"," T = wave.size(-1)\n"," for start in range(0, T, step):\n"," end = min(start + chunk, T)\n"," yield wave[:, start:end]\n"," if end >= T:\n"," break\n","\n","def _load_unigrams(lexicon_path: Optional[str]) -> List[str]:\n"," \"\"\"Read first token per line from lexicon into a unigram list.\"\"\"\n"," if not lexicon_path or not os.path.exists(lexicon_path):\n"," return []\n"," words = set()\n"," with open(lexicon_path, \"r\", encoding=\"utf-8\") as f:\n"," for line in f:\n"," w = line.strip().split()\n"," if w:\n"," words.add(w[0])\n"," return sorted(words)\n","\n","def _build_decoder(model, processor):\n"," \"\"\"Build a pyctcdecode decoder from model vocab + KenLM (if configured).\"\"\"\n"," # Build vocab (id -> token)\n"," vocab_size = model.lm_head.out_features\n"," labels = []\n"," for i in range(vocab_size):\n"," tok = processor.tokenizer.convert_ids_to_tokens([i])[0]\n"," # remove common BPE markers\n"," tok = tok.lstrip(\"Ġ\").lstrip(\"▁\")\n"," labels.append(tok)\n","\n"," # No LM? Use labels only; with LM? also pass unigrams + alpha/beta\n"," if not _HAS_PYCTC:\n"," return None\n","\n"," if KENLM_ARPA and os.path.exists(KENLM_ARPA):\n"," unigrams = _load_unigrams(LEXICON_TXT)\n"," return build_ctcdecoder(\n"," labels=labels,\n"," kenlm_model_path=KENLM_ARPA,\n"," unigrams=unigrams if unigrams else None,\n"," alpha=LM_ALPHA,\n"," beta=LM_BETA\n"," )\n"," else:\n"," # Fallback to lexicon-less decoder (greedy-ish beam without LM)\n"," return build_ctcdecoder(labels=labels)\n","\n","def _postprocess(text: str) -> str:\n"," \"\"\"Light cleanup: strip special markers, collapse dup words, ensure end punctuation.\"\"\"\n"," text = text.replace(\"<|\", \"\").replace(\"|>\", \"\").replace(\"<>\", \"\").strip()\n"," words, cleaned = text.split(), []\n"," for w in words:\n"," if not cleaned or cleaned[-1] != w:\n"," cleaned.append(w)\n"," out = \" \".join(cleaned).strip()\n"," if out and out[-1] not in \".!?\":\n"," out += \".\"\n"," return out\n","\n","def transcribe_one_file() -> str:\n"," # Load model + processor\n"," model = Wav2Vec2BertForCTC.from_pretrained(MODEL_PATH).to(device).eval()\n"," processor = Wav2Vec2BertProcessor.from_pretrained(PROCESSOR_PATH)\n","\n"," # Optional decoder\n"," decoder = _build_decoder(model, processor)\n","\n"," # Load audio\n"," wav = _load_audio(AUDIO_FILE, TARGET_SR)\n","\n"," # Transcribe by chunks\n"," pieces = []\n"," for chunk in _chunks(wav, TARGET_SR, CHUNK_SEC, OVERLAP_SEC):\n"," # processor for Wav2Vec2Bert expects raw audio -> input_features\n"," inputs = processor(chunk.squeeze().numpy(), sampling_rate=TARGET_SR, return_tensors=\"pt\").to(device)\n"," with torch.no_grad():\n"," logits = model(input_features=inputs.input_features).logits # [1, T, V]\n"," logp = logits[0].cpu().numpy()\n","\n"," if decoder is not None:\n"," hypo = decoder.decode(logp, beam_width=BEAM_WIDTH)\n"," else:\n"," # Greedy fallback if pyctcdecode not available\n"," ids = logp.argmax(axis=-1)\n"," tokens = processor.tokenizer.convert_ids_to_tokens(ids.tolist())\n"," hypo = \"\".join(tokens)\n","\n"," if hypo.strip():\n"," pieces.append(hypo.strip())\n","\n"," # cleanup per chunk\n"," del inputs, logits, logp\n","\n"," text = _postprocess(\" \".join(pieces))\n"," return text\n","\n","if __name__ == \"__main__\":\n"," out = transcribe_one_file()\n"," if OUTPUT_TXT:\n"," os.makedirs(os.path.dirname(OUTPUT_TXT), exist_ok=True)\n"," with open(OUTPUT_TXT, \"w\", encoding=\"utf-8\") as f:\n"," f.write(out + \"\\n\")\n"," print(out)\n"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"W1rQvavueaBI","executionInfo":{"status":"ok","timestamp":1756686391018,"user_tz":240,"elapsed":15969,"user":{"displayName":"Beshir Ibrahim","userId":"16736839346810179639"}},"outputId":"8c5a1a6a-dd57-4b82-f891-0f4e45945a93"},"execution_count":8,"outputs":[{"output_type":"stream","name":"stdout","text":["αˆ•αˆ­αŒŠαŒŽ ምነ αˆαŠ• α‹˜α‰ αŠ• αŠ α‰΅αˆ«αŠ­ αŠ₯αŠ•α‹΄ αŠ αŠ•α‰ α‰°α‰΅ αŠ₯α‰₯ αˆ˜αˆ¨α‰£α‰€α‹ ግሩም αˆˆα‰΅αˆαˆŒ αˆ˜α‹²αŠα‰΅ α‰°.\n"]}]},{"cell_type":"code","source":[],"metadata":{"id":"qs6x1lHOlthS"},"execution_count":null,"outputs":[]}]}
+ {
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+ "colab": {
+ "provenance": [],
+ "machine_shape": "hm"
+ },
+ "kernelspec": {
+ "name": "python3",
+ "display_name": "Python 3"
+ },
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "cells": [
+ {
+ "cell_type": "code",
+ "source": [],
+ "metadata": {
+ "id": "Pj64tkijY4tT"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "%%capture\n",
+ "# Core libraries\n",
+ "!pip install torch torchaudio transformers pydub numpy pyctcdecode\n",
+ "# If you need mp3 input support\n",
+ "!sudo apt-get update -qq\n",
+ "!sudo apt-get install -y ffmpeg\n",
+ "# For KenLM ARPA/bin support\n",
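+ "# Note: this builds the kenlm wheel from C++ source; if the build fails on a\n",
+ "# bare image, installing cmake and build-essential first is a likely fix\n",
+ "# (assumption; Colab usually ships both)\n",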
+ "!pip install https://github.com/kpu/kenlm/archive/master.zip"
+ ],
+ "metadata": {
+ "id": "d6IIQn8_hEAy"
+ },
+ "execution_count": 9,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "local_dir = \"/content/tigre-asr-Wav2Vec2Bert\" # or any local path\n",
+ "OUTPUT_TXT = None # e.g., \"/path/to/out.txt\" or None to just print"
+ ],
+ "metadata": {
+ "id": "5j0nwQYG2B4T"
+ },
+ "execution_count": 21,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from huggingface_hub import snapshot_download\n",
+ "\n",
+ "MODEL_PATH = \"BeitTigreAI/tigre-asr-Wav2Vec2Bert\"\n",
+ "PROCESSOR_PATH = MODEL_PATH\n",
+ "\n",
+ "snapshot_download(\n",
+ "    repo_id = MODEL_PATH,\n",
+ "    repo_type = \"model\",\n",
+ "    local_dir = local_dir,\n",
+ "    local_dir_use_symlinks = False # copies files fully\n",
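+ "    # note: local_dir_use_symlinks is deprecated in recent huggingface_hub\n",
+ "    # releases, which copy into local_dir by default; kept for older installs\n",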
+ ")\n",
+ "\n",
+ "AUDIO_FILE = f\"{local_dir}/sample.wav\"\n",
+ "KENLM_ARPA = f\"{local_dir}/lm.arpa\" # if uploaded\n",
+ "LEXICON_TXT = f\"{local_dir}/lexicon.txt\""
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 49
+ },
+ "id": "WNhgqJET08VF",
+ "outputId": "559f767c-d269-4908-ca5e-1bffb8c39395"
+ },
+ "execution_count": 22,
+ "outputs": [
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "Fetching 12 files: 0%| | 0/12 [00:00<?, ?it/s]"
+ ]
+ },
+ "metadata": {}
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import warnings\n",
+ "import logging\n",
+ "\n",
+ "# Silence all Python warnings\n",
+ "warnings.filterwarnings(\"ignore\")\n",
+ "# Silence pyctcdecode logger\n",
+ "logging.getLogger(\"pyctcdecode\").setLevel(logging.ERROR)\n",
+ "# Silence torchaudio warnings (optionally all)\n",
+ "logging.getLogger(\"torchaudio\").setLevel(logging.ERROR)"
+ ],
+ "metadata": {
+ "id": "Y90co7BOmK9n"
+ },
+ "execution_count": 23,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Audio / chunking\n",
+ "TARGET_SR = 16000\n",
+ "CHUNK_SEC = 5 # chunk length in seconds\n",
+ "OVERLAP_SEC = 0 # overlap between chunks in seconds (0 for minimal code)\n",
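+ "# Example: a 12 s clip at 16 kHz with CHUNK_SEC=5, OVERLAP_SEC=0 is cut into\n",
+ "# 5 s + 5 s + 2 s windows (chunk starts at samples 0, 80000, 160000)\n",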
+ "# Beam search params\n",
+ "BEAM_WIDTH = 150\n",
+ "LM_ALPHA = 0.5\n",
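+ "# (in pyctcdecode, alpha scales the LM score and beta is a per-word\n",
+ "# insertion bonus; both are usually tuned on held-out audio)\n",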
+ "LM_BETA = 1.0"
+ ],
+ "metadata": {
+ "id": "7DOmsFxbnzwK"
+ },
+ "execution_count": 24,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import os\n",
+ "import torch\n",
+ "import numpy as np\n",
+ "import torchaudio\n",
+ "from typing import List, Optional\n",
+ "\n",
+ "# Use pydub for robust mp3 handling\n",
+ "from pydub import AudioSegment\n",
+ "\n",
+ "from transformers import Wav2Vec2BertForCTC, Wav2Vec2BertProcessor\n",
+ "\n",
+ "# Optional LM decoding\n",
+ "try:\n",
+ "    from pyctcdecode import build_ctcdecoder\n",
+ "    _HAS_PYCTC = True\n",
+ "except Exception:\n",
+ "    _HAS_PYCTC = False\n",
+ "\n",
+ "# Pick device\n",
+ "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
+ "\n",
+ "def _load_audio(path: str, target_sr: int = 16000) -> torch.Tensor:\n",
+ "    \"\"\"Load WAV or MP3 to mono float32 tensor [1, T] at target_sr.\"\"\"\n",
+ "    ext = os.path.splitext(path)[1].lower()\n",
+ "    if ext == \".mp3\":\n",
+ "        audio = AudioSegment.from_file(path, format=\"mp3\")\n",
+ "        audio = audio.set_channels(1).set_frame_rate(target_sr)\n",
+ "        samples = np.array(audio.get_array_of_samples()).astype(np.float32)\n",
+ "        # pydub yields integer PCM; scale 16-bit samples to [-1, 1]\n",
+ "        if audio.sample_width == 2:\n",
+ "            samples /= 32768.0\n",
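+ "        # other sample widths (8/24/32-bit) would need a different scale;\n",
+ "        # the bundled sample audio is assumed to be 16-bit PCM\n",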
+ "        return torch.from_numpy(samples).unsqueeze(0)\n",
+ "    else:\n",
+ "        wav, sr = torchaudio.load(path)\n",
+ "        if wav.shape[0] > 1:\n",
+ "            wav = wav.mean(dim=0, keepdim=True) # stereo -> mono\n",
+ "        if sr != target_sr:\n",
+ "            wav = torchaudio.transforms.Resample(sr, target_sr)(wav)\n",
+ "        # ensure float32 in [-1,1]\n",
+ "        if wav.dtype != torch.float32:\n",
+ "            wav = wav.to(torch.float32)\n",
+ "        return wav\n",
+ "\n",
+ "def _chunks(wave: torch.Tensor, sr: int, chunk_sec: int, overlap_sec: int):\n",
+ "    \"\"\"Yield possibly-overlapping chunks [1, T_chunk].\"\"\"\n",
+ "    chunk = int(chunk_sec * sr)\n",
+ "    step = max(1, chunk - int(overlap_sec * sr))\n",
+ "    T = wave.size(-1)\n",
+ "    for start in range(0, T, step):\n",
+ "        end = min(start + chunk, T)\n",
+ "        yield wave[:, start:end]\n",
+ "        if end >= T:\n",
+ "            break\n",
+ "\n",
+ "def _load_unigrams(lexicon_path: Optional[str]) -> List[str]:\n",
+ "    \"\"\"Read first token per line from lexicon into a unigram list.\"\"\"\n",
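+ "    # assumes a Kaldi/Flashlight-style lexicon: \"word t o k e n s\" per line\n",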
+ "    if not lexicon_path or not os.path.exists(lexicon_path):\n",
+ "        return []\n",
+ "    words = set()\n",
+ "    with open(lexicon_path, \"r\", encoding=\"utf-8\") as f:\n",
+ "        for line in f:\n",
+ "            w = line.strip().split()\n",
+ "            if w:\n",
+ "                words.add(w[0])\n",
+ "    return sorted(words)\n",
+ "\n",
+ "def _build_decoder(model, processor):\n",
+ "    \"\"\"Build a pyctcdecode decoder from model vocab + KenLM (if configured).\"\"\"\n",
+ "    # Build vocab (id -> token)\n",
+ "    vocab_size = model.lm_head.out_features\n",
+ "    labels = []\n",
+ "    for i in range(vocab_size):\n",
+ "        tok = processor.tokenizer.convert_ids_to_tokens([i])[0]\n",
+ "        # remove common BPE markers\n",
+ "        tok = tok.lstrip(\"Ġ\").lstrip(\"▁\")\n",
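+ "        # (a Wav2Vec2-style CTC vocab is usually plain characters,\n",
+ "        # so this is defensive and normally a no-op)\n",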
+ "        labels.append(tok)\n",
+ "\n",
+ "    # No LM? Use labels only; with LM? also pass unigrams + alpha/beta\n",
+ "    if not _HAS_PYCTC:\n",
+ "        return None\n",
+ "\n",
+ "    if KENLM_ARPA and os.path.exists(KENLM_ARPA):\n",
+ "        unigrams = _load_unigrams(LEXICON_TXT)\n",
+ "        return build_ctcdecoder(\n",
+ "            labels=labels,\n",
+ "            kenlm_model_path=KENLM_ARPA,\n",
+ "            unigrams=unigrams if unigrams else None,\n",
+ "            alpha=LM_ALPHA,\n",
+ "            beta=LM_BETA\n",
+ "        )\n",
+ "    else:\n",
+ "        # Fallback to lexicon-less decoder (beam search without LM)\n",
+ "        return build_ctcdecoder(labels=labels)\n",
+ "\n",
+ "def _postprocess(text: str) -> str:\n",
+ "    \"\"\"Light cleanup: strip special markers, collapse duplicate words, ensure end punctuation.\"\"\"\n",
+ "    text = text.replace(\"<|\", \"\").replace(\"|>\", \"\").replace(\"<>\", \"\").strip()\n",
+ "    words, cleaned = text.split(), []\n",
+ "    for w in words:\n",
+ "        if not cleaned or cleaned[-1] != w:\n",
+ "            cleaned.append(w)\n",
+ "    out = \" \".join(cleaned).strip()\n",
+ "    if out and out[-1] not in \".!?\":\n",
+ "        out += \".\"\n",
+ "    return out\n",
+ "\n",
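+ "# e.g. _postprocess(\"<|she said said hello\") -> \"she said hello.\"\n",
+ "# note it also drops genuinely repeated words, a deliberate trade-off\n",
+ "\n",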
+ "def transcribe_one_file() -> str:\n",
+ "    # Load model + processor\n",
+ "    model = Wav2Vec2BertForCTC.from_pretrained(MODEL_PATH).to(device).eval()\n",
+ "    processor = Wav2Vec2BertProcessor.from_pretrained(PROCESSOR_PATH)\n",
+ "\n",
+ "    # Optional decoder\n",
+ "    decoder = _build_decoder(model, processor)\n",
+ "\n",
+ "    # Load audio\n",
+ "    wav = _load_audio(AUDIO_FILE, TARGET_SR)\n",
+ "\n",
+ "    # Transcribe by chunks\n",
+ "    pieces = []\n",
+ "    for chunk in _chunks(wav, TARGET_SR, CHUNK_SEC, OVERLAP_SEC):\n",
+ "        # processor for Wav2Vec2Bert expects raw audio -> input_features\n",
+ "        inputs = processor(chunk.squeeze().numpy(), sampling_rate=TARGET_SR, return_tensors=\"pt\").to(device)\n",
+ "        with torch.no_grad():\n",
+ "            logits = model(input_features=inputs.input_features).logits # [1, T, V]\n",
+ "        # pyctcdecode expects log-probabilities, so log-softmax the logits\n",
+ "        # (it warns on unnormalized input, which the silenced logger would hide)\n",
+ "        logp = torch.log_softmax(logits[0], dim=-1).cpu().numpy()\n",
+ "\n",
+ "        if decoder is not None:\n",
+ "            hypo = decoder.decode(logp, beam_width=BEAM_WIDTH)\n",
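+ "            # wider beams explore more hypotheses per step at higher latency\n",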
+ "        else:\n",
+ "            # Greedy fallback if pyctcdecode is not available:\n",
+ "            # the CTC tokenizer collapses repeats and strips blanks\n",
+ "            ids = logp.argmax(axis=-1)\n",
+ "            hypo = processor.tokenizer.decode(ids.tolist())\n",
+ "\n",
+ "        if hypo.strip():\n",
+ "            pieces.append(hypo.strip())\n",
+ "\n",
+ "        # cleanup per chunk\n",
+ "        del inputs, logits, logp\n",
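+ "        # (dropping per-chunk tensors keeps GPU memory flat on long files)\n",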
+ "\n",
+ "    text = _postprocess(\" \".join(pieces))\n",
+ "    return text\n",
+ "\n",
+ "if __name__ == \"__main__\":\n",
+ "    out = transcribe_one_file()\n",
+ "    if OUTPUT_TXT:\n",
+ "        # guard: dirname is \"\" when OUTPUT_TXT has no directory part\n",
+ "        os.makedirs(os.path.dirname(OUTPUT_TXT) or \".\", exist_ok=True)\n",
+ "        with open(OUTPUT_TXT, \"w\", encoding=\"utf-8\") as f:\n",
+ "            f.write(out + \"\\n\")\n",
+ "    print(out)\n"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "W1rQvavueaBI",
+ "outputId": "76136358-df59-4f98-bfcc-8e01dc51ed6d"
+ },
+ "execution_count": 25,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "αˆ•αˆ­αŒŠαŒŽ ምነ αˆαŠ• α‹˜α‰ αŠ• αŠ α‰΅αˆ«αŠ­ αŠ₯αŠ•α‹΄ αŠ αŠ•α‰ α‰°α‰΅ αŠ₯α‰₯ αˆ˜αˆ¨α‰£α‰€α‹ ግሩም αˆˆα‰΅αˆαˆŒ αˆ˜α‹²αŠα‰΅ α‰°.\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [],
+ "metadata": {
+ "id": "qs6x1lHOlthS"
+ },
+ "execution_count": null,
+ "outputs": []
+ }
+ ]
+ }