{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"machine_shape":"hm","mount_file_id":"15JwXGAHSNDvOfhIDFaj8h2bt5FijajiD","authorship_tag":"ABX9TyOIpJiulD+u85Vw+4eh9A8m"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"code","source":[],"metadata":{"id":"Pj64tkijY4tT","executionInfo":{"status":"ok","timestamp":1756685653771,"user_tz":240,"elapsed":5,"user":{"displayName":"Beshir Ibrahim","userId":"16736839346810179639"}}},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["%%capture\n","# Core libraries\n","!pip install torch torchaudio transformers pydub numpy pyctcdecode\n","# If you need mp3 input support\n","!sudo apt-get update -qq\n","!sudo apt-get install -y ffmpeg\n","# For KenLM ARPA/bin support\n","!pip install https://github.com/kpu/kenlm/archive/master.zip"],"metadata":{"id":"d6IIQn8_hEAy","executionInfo":{"status":"ok","timestamp":1756686608615,"user_tz":240,"elapsed":56491,"user":{"displayName":"Beshir Ibrahim","userId":"16736839346810179639"}}},"execution_count":9,"outputs":[]},{"cell_type":"code","source":["MODEL_PATH = \"/content/drive/MyDrive/artifacts/models/hf/hf_tgt/tigre-asr-Wav2Vec2Bert\" # model and processor path\n","PROCESSOR_PATH = MODEL_PATH\n","AUDIO_FILE = MODEL_PATH+\"/sample.wav\"\n","OUTPUT_TXT = None # e.g., \"/path/to/out.txt\" or None to just print\n","# KenLM + lexicon (optional but recommended for beam search)\n","KENLM_ARPA = MODEL_PATH+\"/lm.arpa\" # set to None to decode WITHOUT LM\n","LEXICON_TXT = MODEL_PATH+\"/lexicon.txt\" # used to load unigrams; set to None if not available"],"metadata":{"id":"B81FMlsQlSOh","executionInfo":{"status":"ok","timestamp":1756686366477,"user_tz":240,"elapsed":13,"user":{"displayName":"Beshir Ibrahim","userId":"16736839346810179639"}}},"execution_count":7,"outputs":[]},{"cell_type":"code","source":["import warnings\n","import logging\n","\n","# Silence all Python warnings\n","warnings.filterwarnings(\"ignore\")\n","# Silence pyctcdecode logger\n","logging.getLogger(\"pyctcdecode\").setLevel(logging.ERROR)\n","# Silence torchaudio warnings (optionally all)\n","logging.getLogger(\"torchaudio\").setLevel(logging.ERROR)"],"metadata":{"id":"Y90co7BOmK9n","executionInfo":{"status":"ok","timestamp":1756685692591,"user_tz":240,"elapsed":6,"user":{"displayName":"Beshir Ibrahim","userId":"16736839346810179639"}}},"execution_count":2,"outputs":[]},{"cell_type":"code","source":["# Audio / chunking\n","TARGET_SR = 16000\n","CHUNK_SEC = 5 # chunk length in seconds\n","OVERLAP_SEC = 0 # overlap between chunks in seconds (0 for minimal code)\n","# Beam search params\n","BEAM_WIDTH = 150\n","LM_ALPHA = 0.5\n","LM_BETA = 1.0"],"metadata":{"id":"7DOmsFxbnzwK","executionInfo":{"status":"ok","timestamp":1756685693508,"user_tz":240,"elapsed":5,"user":{"displayName":"Beshir Ibrahim","userId":"16736839346810179639"}}},"execution_count":3,"outputs":[]},{"cell_type":"code","source":["import os\n","import torch\n","import numpy as np\n","import torchaudio\n","from typing import List, Optional\n","\n","# Use pydub for robust mp3 handling\n","from pydub import AudioSegment\n","\n","from transformers import Wav2Vec2BertForCTC, Wav2Vec2BertProcessor\n","\n","# Optional LM decoding\n","try:\n"," from pyctcdecode import build_ctcdecoder\n"," _HAS_PYCTC = True\n","except Exception:\n"," _HAS_PYCTC = False\n","\n","# Pick device\n","device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n","\n","def _load_audio(path: str, target_sr: int = 16000) -> torch.Tensor:\n"," \"\"\"Load WAV or MP3 to mono float32 tensor [1, T] at target_sr.\"\"\"\n"," ext = os.path.splitext(path)[1].lower()\n"," if ext == \".mp3\":\n"," audio = AudioSegment.from_file(path, format=\"mp3\")\n"," audio = audio.set_channels(1).set_frame_rate(target_sr)\n"," samples = np.array(audio.get_array_of_samples()).astype(np.float32)\n"," # pydub gives int PCM range; normalize if needed (assume 16-bit)\n"," if samples.dtype != np.float32:\n"," samples = samples.astype(np.float32)\n"," # If sample_width==2 (16-bit), divide by 32768\n"," if audio.sample_width == 2:\n"," samples /= 32768.0\n"," return torch.from_numpy(samples).unsqueeze(0)\n"," else:\n"," wav, sr = torchaudio.load(path)\n"," if wav.shape[0] > 1:\n"," wav = wav.mean(dim=0, keepdim=True) # stereo -> mono\n"," if sr != target_sr:\n"," wav = torchaudio.transforms.Resample(sr, target_sr)(wav)\n"," # ensure float32 in [-1,1]\n"," if wav.dtype != torch.float32:\n"," wav = wav.to(torch.float32)\n"," return wav\n","\n","def _chunks(wave: torch.Tensor, sr: int, chunk_sec: int, overlap_sec: int):\n"," \"\"\"Yield possibly-overlapping chunks [1, T_chunk].\"\"\"\n"," chunk = int(chunk_sec * sr)\n"," step = max(1, chunk - int(overlap_sec * sr))\n"," T = wave.size(-1)\n"," for start in range(0, T, step):\n"," end = min(start + chunk, T)\n"," yield wave[:, start:end]\n"," if end >= T:\n"," break\n","\n","def _load_unigrams(lexicon_path: Optional[str]) -> List[str]:\n"," \"\"\"Read first token per line from lexicon into a unigram list.\"\"\"\n"," if not lexicon_path or not os.path.exists(lexicon_path):\n"," return []\n"," words = set()\n"," with open(lexicon_path, \"r\", encoding=\"utf-8\") as f:\n"," for line in f:\n"," w = line.strip().split()\n"," if w:\n"," words.add(w[0])\n"," return sorted(words)\n","\n","def _build_decoder(model, processor):\n"," \"\"\"Build a pyctcdecode decoder from model vocab + KenLM (if configured).\"\"\"\n"," # Build vocab (id -> token)\n"," vocab_size = model.lm_head.out_features\n"," labels = []\n"," for i in range(vocab_size):\n"," tok = processor.tokenizer.convert_ids_to_tokens([i])[0]\n"," # remove common BPE markers\n"," tok = tok.lstrip(\"Ġ\").lstrip(\"▁\")\n"," labels.append(tok)\n","\n"," # No LM? Use labels only; with LM? also pass unigrams + alpha/beta\n"," if not _HAS_PYCTC:\n"," return None\n","\n"," if KENLM_ARPA and os.path.exists(KENLM_ARPA):\n"," unigrams = _load_unigrams(LEXICON_TXT)\n"," return build_ctcdecoder(\n"," labels=labels,\n"," kenlm_model_path=KENLM_ARPA,\n"," unigrams=unigrams if unigrams else None,\n"," alpha=LM_ALPHA,\n"," beta=LM_BETA\n"," )\n"," else:\n"," # Fallback to lexicon-less decoder (greedy-ish beam without LM)\n"," return build_ctcdecoder(labels=labels)\n","\n","def _postprocess(text: str) -> str:\n"," \"\"\"Light cleanup: strip special markers, collapse dup words, ensure end punctuation.\"\"\"\n"," text = text.replace(\"<|\", \"\").replace(\"|>\", \"\").replace(\"<>\", \"\").strip()\n"," words, cleaned = text.split(), []\n"," for w in words:\n"," if not cleaned or cleaned[-1] != w:\n"," cleaned.append(w)\n"," out = \" \".join(cleaned).strip()\n"," if out and out[-1] not in \".!?\":\n"," out += \".\"\n"," return out\n","\n","def transcribe_one_file() -> str:\n"," # Load model + processor\n"," model = Wav2Vec2BertForCTC.from_pretrained(MODEL_PATH).to(device).eval()\n"," processor = Wav2Vec2BertProcessor.from_pretrained(PROCESSOR_PATH)\n","\n"," # Optional decoder\n"," decoder = _build_decoder(model, processor)\n","\n"," # Load audio\n"," wav = _load_audio(AUDIO_FILE, TARGET_SR)\n","\n"," # Transcribe by chunks\n"," pieces = []\n"," for chunk in _chunks(wav, TARGET_SR, CHUNK_SEC, OVERLAP_SEC):\n"," # processor for Wav2Vec2Bert expects raw audio -> input_features\n"," inputs = processor(chunk.squeeze().numpy(), sampling_rate=TARGET_SR, return_tensors=\"pt\").to(device)\n"," with torch.no_grad():\n"," logits = model(input_features=inputs.input_features).logits # [1, T, V]\n"," logp = logits[0].cpu().numpy()\n","\n"," if decoder is not None:\n"," hypo = decoder.decode(logp, beam_width=BEAM_WIDTH)\n"," else:\n"," # Greedy fallback if pyctcdecode not available\n"," ids = logp.argmax(axis=-1)\n"," tokens = processor.tokenizer.convert_ids_to_tokens(ids.tolist())\n"," hypo = \"\".join(tokens)\n","\n"," if hypo.strip():\n"," pieces.append(hypo.strip())\n","\n"," # cleanup per chunk\n"," del inputs, logits, logp\n","\n"," text = _postprocess(\" \".join(pieces))\n"," return text\n","\n","if __name__ == \"__main__\":\n"," out = transcribe_one_file()\n"," if OUTPUT_TXT:\n"," os.makedirs(os.path.dirname(OUTPUT_TXT), exist_ok=True)\n"," with open(OUTPUT_TXT, \"w\", encoding=\"utf-8\") as f:\n"," f.write(out + \"\\n\")\n"," print(out)\n"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"W1rQvavueaBI","executionInfo":{"status":"ok","timestamp":1756686391018,"user_tz":240,"elapsed":15969,"user":{"displayName":"Beshir Ibrahim","userId":"16736839346810179639"}},"outputId":"8c5a1a6a-dd57-4b82-f891-0f4e45945a93"},"execution_count":8,"outputs":[{"output_type":"stream","name":"stdout","text":["ሕርጊጎ ምነ ምን ዘበን አትራክ እንዴ አንበተት እብ መረባቤዐ ግሩም ለትሐሌ መዲነት ተ.\n"]}]},{"cell_type":"code","source":[],"metadata":{"id":"qs6x1lHOlthS"},"execution_count":null,"outputs":[]}]}