{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"machine_shape":"hm","mount_file_id":"15JwXGAHSNDvOfhIDFaj8h2bt5FijajiD","authorship_tag":"ABX9TyOIpJiulD+u85Vw+4eh9A8m"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"code","source":[],"metadata":{"id":"Pj64tkijY4tT","executionInfo":{"status":"ok","timestamp":1756685653771,"user_tz":240,"elapsed":5,"user":{"displayName":"Beshir Ibrahim","userId":"16736839346810179639"}}},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["%%capture\n","# Core libraries\n","!pip install torch torchaudio transformers pydub numpy pyctcdecode\n","# If you need mp3 input support\n","!sudo apt-get update -qq\n","!sudo apt-get install -y ffmpeg\n","# For KenLM ARPA/bin support\n","!pip install https://github.com/kpu/kenlm/archive/master.zip"],"metadata":{"id":"d6IIQn8_hEAy","executionInfo":{"status":"ok","timestamp":1756686608615,"user_tz":240,"elapsed":56491,"user":{"displayName":"Beshir Ibrahim","userId":"16736839346810179639"}}},"execution_count":9,"outputs":[]},{"cell_type":"code","source":["MODEL_PATH     = \"/content/drive/MyDrive/artifacts/models/hf/hf_tgt/tigre-asr-Wav2Vec2Bert\"              # model and processor path\n","PROCESSOR_PATH = MODEL_PATH\n","AUDIO_FILE     = MODEL_PATH+\"/sample.wav\"\n","OUTPUT_TXT     = None                                   # e.g., \"/path/to/out.txt\" or None to just print\n","# KenLM + lexicon (optional but recommended for beam search)\n","KENLM_ARPA     = MODEL_PATH+\"/lm.arpa\"                  # set to None to decode WITHOUT LM\n","LEXICON_TXT    = MODEL_PATH+\"/lexicon.txt\"              # used to load unigrams; set to None if not available"],"metadata":{"id":"B81FMlsQlSOh","executionInfo":{"status":"ok","timestamp":1756686366477,"user_tz":240,"elapsed":13,"user":{"displayName":"Beshir Ibrahim","userId":"16736839346810179639"}}},"execution_count":7,"outputs":[]},{"cell_type":"code","source":["import warnings\n","import logging\n","\n","# Silence all Python warnings\n","warnings.filterwarnings(\"ignore\")\n","# Silence pyctcdecode logger\n","logging.getLogger(\"pyctcdecode\").setLevel(logging.ERROR)\n","# Silence torchaudio warnings (optionally all)\n","logging.getLogger(\"torchaudio\").setLevel(logging.ERROR)"],"metadata":{"id":"Y90co7BOmK9n","executionInfo":{"status":"ok","timestamp":1756685692591,"user_tz":240,"elapsed":6,"user":{"displayName":"Beshir Ibrahim","userId":"16736839346810179639"}}},"execution_count":2,"outputs":[]},{"cell_type":"code","source":["# Audio / chunking\n","TARGET_SR      = 16000\n","CHUNK_SEC      = 5        # chunk length in seconds\n","OVERLAP_SEC    = 0        # overlap between chunks in seconds (0 for minimal code)\n","# Beam search params\n","BEAM_WIDTH     = 150\n","LM_ALPHA       = 0.5\n","LM_BETA        = 1.0"],"metadata":{"id":"7DOmsFxbnzwK","executionInfo":{"status":"ok","timestamp":1756685693508,"user_tz":240,"elapsed":5,"user":{"displayName":"Beshir Ibrahim","userId":"16736839346810179639"}}},"execution_count":3,"outputs":[]},{"cell_type":"code","source":["import os\n","import torch\n","import numpy as np\n","import torchaudio\n","from typing import List, Optional\n","\n","# Use pydub for robust mp3 handling\n","from pydub import AudioSegment\n","\n","from transformers import Wav2Vec2BertForCTC, Wav2Vec2BertProcessor\n","\n","# Optional LM decoding\n","try:\n","    from pyctcdecode import build_ctcdecoder\n","    _HAS_PYCTC = True\n","except Exception:\n","    _HAS_PYCTC = False\n","\n","# Pick device\n","device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n","\n","def _load_audio(path: str, target_sr: int = 16000) -> torch.Tensor:\n","    \"\"\"Load WAV or MP3 to mono float32 tensor [1, T] at target_sr.\"\"\"\n","    ext = os.path.splitext(path)[1].lower()\n","    if ext == \".mp3\":\n","        audio = AudioSegment.from_file(path, format=\"mp3\")\n","        audio = audio.set_channels(1).set_frame_rate(target_sr)\n","        samples = np.array(audio.get_array_of_samples()).astype(np.float32)\n","        # pydub gives int PCM range; normalize if needed (assume 16-bit)\n","        if samples.dtype != np.float32:\n","            samples = samples.astype(np.float32)\n","        # If sample_width==2 (16-bit), divide by 32768\n","        if audio.sample_width == 2:\n","            samples /= 32768.0\n","        return torch.from_numpy(samples).unsqueeze(0)\n","    else:\n","        wav, sr = torchaudio.load(path)\n","        if wav.shape[0] > 1:\n","            wav = wav.mean(dim=0, keepdim=True)  # stereo -> mono\n","        if sr != target_sr:\n","            wav = torchaudio.transforms.Resample(sr, target_sr)(wav)\n","        # ensure float32 in [-1,1]\n","        if wav.dtype != torch.float32:\n","            wav = wav.to(torch.float32)\n","        return wav\n","\n","def _chunks(wave: torch.Tensor, sr: int, chunk_sec: int, overlap_sec: int):\n","    \"\"\"Yield possibly-overlapping chunks [1, T_chunk].\"\"\"\n","    chunk = int(chunk_sec * sr)\n","    step  = max(1, chunk - int(overlap_sec * sr))\n","    T     = wave.size(-1)\n","    for start in range(0, T, step):\n","        end = min(start + chunk, T)\n","        yield wave[:, start:end]\n","        if end >= T:\n","            break\n","\n","def _load_unigrams(lexicon_path: Optional[str]) -> List[str]:\n","    \"\"\"Read first token per line from lexicon into a unigram list.\"\"\"\n","    if not lexicon_path or not os.path.exists(lexicon_path):\n","        return []\n","    words = set()\n","    with open(lexicon_path, \"r\", encoding=\"utf-8\") as f:\n","        for line in f:\n","            w = line.strip().split()\n","            if w:\n","                words.add(w[0])\n","    return sorted(words)\n","\n","def _build_decoder(model, processor):\n","    \"\"\"Build a pyctcdecode decoder from model vocab + KenLM (if configured).\"\"\"\n","    # Build vocab (id -> token)\n","    vocab_size = model.lm_head.out_features\n","    labels = []\n","    for i in range(vocab_size):\n","        tok = processor.tokenizer.convert_ids_to_tokens([i])[0]\n","        # remove common BPE markers\n","        tok = tok.lstrip(\"Ġ\").lstrip(\"▁\")\n","        labels.append(tok)\n","\n","    # No LM? Use labels only; with LM? also pass unigrams + alpha/beta\n","    if not _HAS_PYCTC:\n","        return None\n","\n","    if KENLM_ARPA and os.path.exists(KENLM_ARPA):\n","        unigrams = _load_unigrams(LEXICON_TXT)\n","        return build_ctcdecoder(\n","            labels=labels,\n","            kenlm_model_path=KENLM_ARPA,\n","            unigrams=unigrams if unigrams else None,\n","            alpha=LM_ALPHA,\n","            beta=LM_BETA\n","        )\n","    else:\n","        # Fallback to lexicon-less decoder (greedy-ish beam without LM)\n","        return build_ctcdecoder(labels=labels)\n","\n","def _postprocess(text: str) -> str:\n","    \"\"\"Light cleanup: strip special markers, collapse dup words, ensure end punctuation.\"\"\"\n","    text = text.replace(\"<|\", \"\").replace(\"|>\", \"\").replace(\"<>\", \"\").strip()\n","    words, cleaned = text.split(), []\n","    for w in words:\n","        if not cleaned or cleaned[-1] != w:\n","            cleaned.append(w)\n","    out = \" \".join(cleaned).strip()\n","    if out and out[-1] not in \".!?\":\n","        out += \".\"\n","    return out\n","\n","def transcribe_one_file() -> str:\n","    # Load model + processor\n","    model = Wav2Vec2BertForCTC.from_pretrained(MODEL_PATH).to(device).eval()\n","    processor = Wav2Vec2BertProcessor.from_pretrained(PROCESSOR_PATH)\n","\n","    # Optional decoder\n","    decoder = _build_decoder(model, processor)\n","\n","    # Load audio\n","    wav = _load_audio(AUDIO_FILE, TARGET_SR)\n","\n","    # Transcribe by chunks\n","    pieces = []\n","    for chunk in _chunks(wav, TARGET_SR, CHUNK_SEC, OVERLAP_SEC):\n","        # processor for Wav2Vec2Bert expects raw audio -> input_features\n","        inputs = processor(chunk.squeeze().numpy(), sampling_rate=TARGET_SR, return_tensors=\"pt\").to(device)\n","        with torch.no_grad():\n","            logits = model(input_features=inputs.input_features).logits  # [1, T, V]\n","        logp = logits[0].cpu().numpy()\n","\n","        if decoder is not None:\n","            hypo = decoder.decode(logp, beam_width=BEAM_WIDTH)\n","        else:\n","            # Greedy fallback if pyctcdecode not available\n","            ids = logp.argmax(axis=-1)\n","            tokens = processor.tokenizer.convert_ids_to_tokens(ids.tolist())\n","            hypo = \"\".join(tokens)\n","\n","        if hypo.strip():\n","            pieces.append(hypo.strip())\n","\n","        # cleanup per chunk\n","        del inputs, logits, logp\n","\n","    text = _postprocess(\" \".join(pieces))\n","    return text\n","\n","if __name__ == \"__main__\":\n","    out = transcribe_one_file()\n","    if OUTPUT_TXT:\n","        os.makedirs(os.path.dirname(OUTPUT_TXT), exist_ok=True)\n","        with open(OUTPUT_TXT, \"w\", encoding=\"utf-8\") as f:\n","            f.write(out + \"\\n\")\n","    print(out)\n"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"W1rQvavueaBI","executionInfo":{"status":"ok","timestamp":1756686391018,"user_tz":240,"elapsed":15969,"user":{"displayName":"Beshir Ibrahim","userId":"16736839346810179639"}},"outputId":"8c5a1a6a-dd57-4b82-f891-0f4e45945a93"},"execution_count":8,"outputs":[{"output_type":"stream","name":"stdout","text":["ሕርጊጎ ምነ ምን ዘበን አትራክ እንዴ አንበተት እብ መረባቤዐ ግሩም ለትሐሌ መዲነት ተ.\n"]}]},{"cell_type":"code","source":[],"metadata":{"id":"qs6x1lHOlthS"},"execution_count":null,"outputs":[]}]}