Upload 9 files

Browse files

Files changed (10) hide show

.gitattributes +1 -0
added_tokens.json +6 -0
config.json +81 -0
preprocessor_config.json +11 -0
sample.wav +3 -0
special_tokens_map.json +6 -0
tig_lm.bin +3 -0
tokenizer_config.json +65 -0
transcribe.ipynb +1 -0
vocab.json +199 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+sample.wav filter=lfs diff=lfs merge=lfs -text

added_tokens.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "</s>": 198,
+  "<pad>": 200,
+  "<s>": 197,
+  "<unk>": 199
+}

config.json ADDED Viewed

	@@ -0,0 +1,81 @@

+{
+  "activation_dropout": 0.0,
+  "adapter_act": "relu",
+  "adapter_kernel_size": 3,
+  "adapter_stride": 2,
+  "add_adapter": true,
+  "apply_spec_augment": false,
+  "architectures": [
+    "Wav2Vec2BertForCTC"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 1,
+  "classifier_proj_size": 768,
+  "codevector_dim": 768,
+  "conformer_conv_dropout": 0.1,
+  "contrastive_logits_temperature": 0.1,
+  "conv_depthwise_kernel_size": 31,
+  "ctc_loss_reduction": "mean",
+  "ctc_zero_infinity": false,
+  "diversity_loss_weight": 0.1,
+  "eos_token_id": 2,
+  "feat_proj_dropout": 0.0,
+  "feat_quantizer_dropout": 0.0,
+  "feature_projection_input_dim": 160,
+  "final_dropout": 0.1,
+  "hidden_act": "swish",
+  "hidden_dropout": 0.0,
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": 4096,
+  "layer_norm_eps": 1e-05,
+  "layerdrop": 0.0,
+  "left_max_position_embeddings": 64,
+  "mask_feature_length": 10,
+  "mask_feature_min_masks": 0,
+  "mask_feature_prob": 0.0,
+  "mask_time_length": 10,
+  "mask_time_min_masks": 2,
+  "mask_time_prob": 0.0,
+  "max_source_positions": 5000,
+  "model_type": "wav2vec2-bert",
+  "num_adapter_layers": 1,
+  "num_attention_heads": 16,
+  "num_codevector_groups": 2,
+  "num_codevectors_per_group": 320,
+  "num_hidden_layers": 24,
+  "num_negatives": 100,
+  "output_hidden_size": 1024,
+  "pad_token_id": 196,
+  "position_embeddings_type": "relative_key",
+  "proj_codevector_dim": 768,
+  "right_max_position_embeddings": 8,
+  "rotary_embedding_base": 10000,
+  "tdnn_dilation": [
+    1,
+    2,
+    3,
+    1,
+    1
+  ],
+  "tdnn_dim": [
+    512,
+    512,
+    512,
+    512,
+    1500
+  ],
+  "tdnn_kernel": [
+    5,
+    3,
+    3,
+    1,
+    1
+  ],
+  "torch_dtype": "float32",
+  "transformers_version": "4.55.4",
+  "use_intermediate_ffn_before_adapter": false,
+  "use_weighted_layer_sum": false,
+  "vocab_size": 199,
+  "xvector_output_dim": 512
+}

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,11 @@

+{
+  "feature_extractor_type": "SeamlessM4TFeatureExtractor",
+  "feature_size": 80,
+  "num_mel_bins": 80,
+  "padding_side": "right",
+  "padding_value": 0.0,
+  "processor_class": "Wav2Vec2BertProcessor",
+  "return_attention_mask": true,
+  "sampling_rate": 16000,
+  "stride": 2
+}

sample.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:db8e88d68185893ce0bc409cb3ee5b513f8f56b3be02d13d5a5157a34fd70ea1
+size 668204

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "bos_token": "<s>",
+  "eos_token": "</s>",
+  "pad_token": "[PAD]",
+  "unk_token": "[UNK]"
+}

tig_lm.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:80749043e27b3f75460846e4ce27f83247ab5d82310862b0ef5599ff32bfa5ef
+size 21659616

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,65 @@

+{
+  "added_tokens_decoder": {
+    "195": {
+      "content": "[UNK]",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": false
+    },
+    "196": {
+      "content": "[PAD]",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": false
+    },
+    "197": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "198": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "199": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "200": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "do_lower_case": false,
+  "eos_token": "</s>",
+  "extra_special_tokens": {},
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "[PAD]",
+  "processor_class": "Wav2Vec2BertProcessor",
+  "replace_word_delimiter_char": " ",
+  "target_lang": null,
+  "tokenizer_class": "Wav2Vec2CTCTokenizer",
+  "unk_token": "[UNK]",
+  "word_delimiter_token": "|"
+}

transcribe.ipynb ADDED Viewed

	@@ -0,0 +1 @@

+ {"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"machine_shape":"hm","mount_file_id":"15JwXGAHSNDvOfhIDFaj8h2bt5FijajiD","authorship_tag":"ABX9TyOIpJiulD+u85Vw+4eh9A8m"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[
{"cell_type":"code","source":[],"metadata":{"id":"Pj64tkijY4tT","executionInfo":{"status":"ok","timestamp":1756685653771,"user_tz":240,"elapsed":5,"user":{"displayName":"Beshir Ibrahim","userId":"16736839346810179639"}}},"execution_count":null,"outputs":[]},
{"cell_type":"code","source":[
  "%%capture\n",
  "# Core libraries\n",
  "!pip install torch torchaudio transformers pydub numpy pyctcdecode\n",
  "# If you need mp3 input support\n",
  "!sudo apt-get update -qq\n",
  "!sudo apt-get install -y ffmpeg\n",
  "# For KenLM ARPA/bin support\n",
  "!pip install https://github.com/kpu/kenlm/archive/master.zip"
],"metadata":{"id":"d6IIQn8_hEAy","executionInfo":{"status":"ok","timestamp":1756686608615,"user_tz":240,"elapsed":56491,"user":{"displayName":"Beshir Ibrahim","userId":"16736839346810179639"}}},"execution_count":9,"outputs":[]},
{"cell_type":"code","source":[
  "MODEL_PATH = \"/content/drive/MyDrive/artifacts/models/hf/hf_tgt/tigre-asr-Wav2Vec2Bert\" # model and processor path\n",
  "PROCESSOR_PATH = MODEL_PATH\n",
  "AUDIO_FILE = MODEL_PATH+\"/sample.wav\"\n",
  "OUTPUT_TXT = None # e.g., \"/path/to/out.txt\" or None to just print\n",
  "# KenLM + lexicon (optional but recommended for beam search)\n",
  "# The repository ships the language model as tig_lm.bin; a .arpa file works here too.\n",
  "KENLM_ARPA = MODEL_PATH+\"/tig_lm.bin\" # set to None to decode WITHOUT LM\n",
  "LEXICON_TXT = MODEL_PATH+\"/lexicon.txt\" # used to load unigrams; set to None if not available"
],"metadata":{"id":"B81FMlsQlSOh","executionInfo":{"status":"ok","timestamp":1756686366477,"user_tz":240,"elapsed":13,"user":{"displayName":"Beshir Ibrahim","userId":"16736839346810179639"}}},"execution_count":7,"outputs":[]},
{"cell_type":"code","source":[
  "import warnings\n",
  "import logging\n",
  "\n",
  "# Silence all Python warnings\n",
  "warnings.filterwarnings(\"ignore\")\n",
  "# Silence pyctcdecode logger\n",
  "logging.getLogger(\"pyctcdecode\").setLevel(logging.ERROR)\n",
  "# Silence torchaudio warnings (optionally all)\n",
  "logging.getLogger(\"torchaudio\").setLevel(logging.ERROR)"
],"metadata":{"id":"Y90co7BOmK9n","executionInfo":{"status":"ok","timestamp":1756685692591,"user_tz":240,"elapsed":6,"user":{"displayName":"Beshir Ibrahim","userId":"16736839346810179639"}}},"execution_count":2,"outputs":[]},
{"cell_type":"code","source":[
  "# Audio / chunking\n",
  "TARGET_SR = 16000\n",
  "CHUNK_SEC = 5 # chunk length in seconds\n",
  "OVERLAP_SEC = 0 # overlap between chunks in seconds (0 for minimal code)\n",
  "# Beam search params\n",
  "BEAM_WIDTH = 150\n",
  "LM_ALPHA = 0.5\n",
  "LM_BETA = 1.0"
],"metadata":{"id":"7DOmsFxbnzwK","executionInfo":{"status":"ok","timestamp":1756685693508,"user_tz":240,"elapsed":5,"user":{"displayName":"Beshir Ibrahim","userId":"16736839346810179639"}}},"execution_count":3,"outputs":[]},
{"cell_type":"code","source":[
  "import os\n",
  "import torch\n",
  "import numpy as np\n",
  "import torchaudio\n",
  "from typing import List, Optional\n",
  "\n",
  "# Use pydub for robust mp3 handling\n",
  "from pydub import AudioSegment\n",
  "\n",
  "from transformers import Wav2Vec2BertForCTC, Wav2Vec2BertProcessor\n",
  "\n",
  "# Optional LM decoding\n",
  "try:\n",
  "    from pyctcdecode import build_ctcdecoder\n",
  "    _HAS_PYCTC = True\n",
  "except Exception:\n",
  "    _HAS_PYCTC = False\n",
  "\n",
  "# Pick device\n",
  "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
  "\n",
  "def _load_audio(path: str, target_sr: int = 16000) -> torch.Tensor:\n",
  "    \"\"\"Load WAV or MP3 to mono float32 tensor [1, T] at target_sr.\"\"\"\n",
  "    ext = os.path.splitext(path)[1].lower()\n",
  "    if ext == \".mp3\":\n",
  "        audio = AudioSegment.from_file(path, format=\"mp3\")\n",
  "        audio = audio.set_channels(1).set_frame_rate(target_sr)\n",
  "        samples = np.array(audio.get_array_of_samples()).astype(np.float32)\n",
  "        # pydub yields signed integer PCM; divide by the full-scale value of the\n",
  "        # actual sample width (not only 16-bit) so samples land in [-1, 1).\n",
  "        samples /= float(1 << (8 * audio.sample_width - 1))\n",
  "        return torch.from_numpy(samples).unsqueeze(0)\n",
  "\n",
  "    wav, sr = torchaudio.load(path)\n",
  "    if wav.shape[0] > 1:\n",
  "        wav = wav.mean(dim=0, keepdim=True)  # stereo -> mono\n",
  "    if sr != target_sr:\n",
  "        wav = torchaudio.transforms.Resample(sr, target_sr)(wav)\n",
  "    # ensure float32\n",
  "    if wav.dtype != torch.float32:\n",
  "        wav = wav.to(torch.float32)\n",
  "    return wav\n",
  "\n",
  "def _chunks(wave: torch.Tensor, sr: int, chunk_sec: float, overlap_sec: float,\n",
  "            min_tail_sec: float = 0.5):\n",
  "    \"\"\"Yield possibly-overlapping chunks [1, T_chunk].\n",
  "\n",
  "    A remainder shorter than min_tail_sec is appended to the preceding chunk\n",
  "    instead of being yielded alone: a handful of stray samples may be too short\n",
  "    for the feature extractor to produce any frames.\n",
  "    \"\"\"\n",
  "    chunk = int(chunk_sec * sr)\n",
  "    step = max(1, chunk - int(overlap_sec * sr))\n",
  "    min_tail = int(min_tail_sec * sr)\n",
  "    T = wave.size(-1)\n",
  "    for start in range(0, T, step):\n",
  "        end = min(start + chunk, T)\n",
  "        if end < T and T - (start + step) < min_tail:\n",
  "            end = T  # absorb the short remainder into this chunk\n",
  "        yield wave[:, start:end]\n",
  "        if end >= T:\n",
  "            break\n",
  "\n",
  "def _load_unigrams(lexicon_path: Optional[str]) -> List[str]:\n",
  "    \"\"\"Read first token per line from lexicon into a sorted, de-duplicated unigram list.\"\"\"\n",
  "    if not lexicon_path or not os.path.exists(lexicon_path):\n",
  "        return []\n",
  "    words = set()\n",
  "    with open(lexicon_path, \"r\", encoding=\"utf-8\") as f:\n",
  "        for line in f:\n",
  "            w = line.strip().split()\n",
  "            if w:\n",
  "                words.add(w[0])\n",
  "    return sorted(words)\n",
  "\n",
  "def _build_decoder(model, processor):\n",
  "    \"\"\"Build a pyctcdecode decoder from model vocab + KenLM (if configured).\n",
  "\n",
  "    Returns None when pyctcdecode is not installed.\n",
  "    \"\"\"\n",
  "    if not _HAS_PYCTC:\n",
  "        return None\n",
  "\n",
  "    # Build vocab (id -> token): one label per output unit of the CTC head\n",
  "    vocab_size = model.lm_head.out_features\n",
  "    labels = []\n",
  "    for i in range(vocab_size):\n",
  "        tok = processor.tokenizer.convert_ids_to_tokens([i])[0]\n",
  "        # remove common BPE markers\n",
  "        tok = tok.lstrip(\"Ġ\").lstrip(\"▁\")\n",
  "        labels.append(tok)\n",
  "\n",
  "    # With an LM file present, also pass unigrams + alpha/beta\n",
  "    if KENLM_ARPA and os.path.exists(KENLM_ARPA):\n",
  "        unigrams = _load_unigrams(LEXICON_TXT)\n",
  "        return build_ctcdecoder(\n",
  "            labels=labels,\n",
  "            kenlm_model_path=KENLM_ARPA,\n",
  "            unigrams=unigrams if unigrams else None,\n",
  "            alpha=LM_ALPHA,\n",
  "            beta=LM_BETA\n",
  "        )\n",
  "    # Fallback: beam search over the labels only, without LM\n",
  "    return build_ctcdecoder(labels=labels)\n",
  "\n",
  "def _greedy_ctc_decode(ids: List[int], tokenizer, blank_id: int) -> str:\n",
  "    \"\"\"Turn a frame-level argmax path into text using the CTC collapse rules.\"\"\"\n",
  "    kept, prev = [], None\n",
  "    for i in ids:\n",
  "        # merge runs of the same id first, then drop the blank\n",
  "        if i != prev and i != blank_id:\n",
  "            kept.append(i)\n",
  "        prev = i\n",
  "    specials = set(tokenizer.all_special_tokens)\n",
  "    delim = getattr(tokenizer, \"word_delimiter_token\", None)\n",
  "    chars = []\n",
  "    for tok in tokenizer.convert_ids_to_tokens(kept):\n",
  "        if tok == delim:\n",
  "            chars.append(\" \")\n",
  "        elif tok not in specials:\n",
  "            chars.append(tok)\n",
  "    return \" \".join(\"\".join(chars).split())\n",
  "\n",
  "def _postprocess(text: str) -> str:\n",
  "    \"\"\"Light cleanup: strip special markers, collapse dup words, ensure end punctuation.\"\"\"\n",
  "    text = text.replace(\"<|\", \"\").replace(\"|>\", \"\").replace(\"<>\", \"\").strip()\n",
  "    words, cleaned = text.split(), []\n",
  "    for w in words:\n",
  "        if not cleaned or cleaned[-1] != w:\n",
  "            cleaned.append(w)\n",
  "    out = \" \".join(cleaned).strip()\n",
  "    if out and out[-1] not in \".!?\":\n",
  "        out += \".\"\n",
  "    return out\n",
  "\n",
  "def transcribe_one_file() -> str:\n",
  "    \"\"\"Transcribe AUDIO_FILE chunk by chunk and return the cleaned-up text.\"\"\"\n",
  "    # Load model + processor\n",
  "    model = Wav2Vec2BertForCTC.from_pretrained(MODEL_PATH).to(device).eval()\n",
  "    processor = Wav2Vec2BertProcessor.from_pretrained(PROCESSOR_PATH)\n",
  "\n",
  "    # Optional decoder\n",
  "    decoder = _build_decoder(model, processor)\n",
  "\n",
  "    # Load audio\n",
  "    wav = _load_audio(AUDIO_FILE, TARGET_SR)\n",
  "\n",
  "    # Transcribe by chunks\n",
  "    pieces = []\n",
  "    for chunk in _chunks(wav, TARGET_SR, CHUNK_SEC, OVERLAP_SEC):\n",
  "        # processor for Wav2Vec2Bert expects raw audio -> input_features\n",
  "        inputs = processor(chunk.squeeze(0).numpy(), sampling_rate=TARGET_SR, return_tensors=\"pt\").to(device)\n",
  "        with torch.no_grad():\n",
  "            logits = model(input_features=inputs.input_features).logits  # [1, T, V]\n",
  "        frame_logits = logits[0].cpu().numpy()\n",
  "\n",
  "        if decoder is not None:\n",
  "            hypo = decoder.decode(frame_logits, beam_width=BEAM_WIDTH)\n",
  "        else:\n",
  "            # Greedy fallback if pyctcdecode not available. The CTC blank is the\n",
  "            # model's pad_token_id, which need not match the tokenizer's pad token.\n",
  "            ids = frame_logits.argmax(axis=-1)\n",
  "            hypo = _greedy_ctc_decode(ids.tolist(), processor.tokenizer, model.config.pad_token_id)\n",
  "\n",
  "        if hypo.strip():\n",
  "            pieces.append(hypo.strip())\n",
  "\n",
  "        # cleanup per chunk\n",
  "        del inputs, logits, frame_logits\n",
  "\n",
  "    text = _postprocess(\" \".join(pieces))\n",
  "    return text\n",
  "\n",
  "if __name__ == \"__main__\":\n",
  "    out = transcribe_one_file()\n",
  "    if OUTPUT_TXT:\n",
  "        # dirname is empty for a bare filename; os.makedirs(\"\") would raise\n",
  "        out_dir = os.path.dirname(OUTPUT_TXT)\n",
  "        if out_dir:\n",
  "            os.makedirs(out_dir, exist_ok=True)\n",
  "        with open(OUTPUT_TXT, \"w\", encoding=\"utf-8\") as f:\n",
  "            f.write(out + \"\\n\")\n",
  "    print(out)\n"
],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"W1rQvavueaBI","executionInfo":{"status":"ok","timestamp":1756686391018,"user_tz":240,"elapsed":15969,"user":{"displayName":"Beshir Ibrahim","userId":"16736839346810179639"}},"outputId":"8c5a1a6a-dd57-4b82-f891-0f4e45945a93"},"execution_count":8,"outputs":[{"output_type":"stream","name":"stdout","text":["ሕርጊጎ ምነ ምን ዘበን አትራክ እንዴ አንበተት እብ መረባቤዐ ግሩም ለትሐሌ መዲነት ተ.\n"]}]},
{"cell_type":"code","source":[],"metadata":{"id":"qs6x1lHOlthS"},"execution_count":null,"outputs":[]}
]}

vocab.json ADDED Viewed

	@@ -0,0 +1,199 @@

+{
+  "<": 1,
+  ">": 2,
+  "[PAD]": 196,
+  "[UNK]": 195,
+  "|": 0,
+  "ሀ": 3,
+  "ሁ": 4,
+  "ሂ": 5,
+  "ሃ": 6,
+  "ሄ": 7,
+  "ህ": 8,
+  "ሆ": 9,
+  "ለ": 10,
+  "ሉ": 11,
+  "ሊ": 12,
+  "ላ": 13,
+  "ሌ": 14,
+  "ል": 15,
+  "ሎ": 16,
+  "ሐ": 17,
+  "ሑ": 18,
+  "ሒ": 19,
+  "ሓ": 20,
+  "ሔ": 21,
+  "ሕ": 22,
+  "ሖ": 23,
+  "መ": 24,
+  "ሙ": 25,
+  "ሚ": 26,
+  "ማ": 27,
+  "ሜ": 28,
+  "ም": 29,
+  "ሞ": 30,
+  "ሣ": 31,
+  "ሥ": 32,
+  "ረ": 33,
+  "ሩ": 34,
+  "ሪ": 35,
+  "ራ": 36,
+  "ሬ": 37,
+  "ር": 38,
+  "ሮ": 39,
+  "ሰ": 40,
+  "ሱ": 41,
+  "ሲ": 42,
+  "ሳ": 43,
+  "ሴ": 44,
+  "ስ": 45,
+  "ሶ": 46,
+  "ሸ": 47,
+  "ሹ": 48,
+  "ሺ": 49,
+  "ሻ": 50,
+  "ሼ": 51,
+  "ሽ": 52,
+  "ሾ": 53,
+  "ቀ": 54,
+  "ቁ": 55,
+  "ቂ": 56,
+  "ቃ": 57,
+  "ቄ": 58,
+  "ቅ": 59,
+  "ቆ": 60,
+  "ቈ": 61,
+  "ቍ": 62,
+  "ቐ": 63,
+  "ቑ": 64,
+  "ቒ": 65,
+  "ቓ": 66,
+  "ቕ": 67,
+  "ቖ": 68,
+  "ቛ": 69,
+  "በ": 70,
+  "ቡ": 71,
+  "ቢ": 72,
+  "ባ": 73,
+  "ቤ": 74,
+  "ብ": 75,
+  "ቦ": 76,
+  "ተ": 77,
+  "ቱ": 78,
+  "ቲ": 79,
+  "ታ": 80,
+  "ቴ": 81,
+  "ት": 82,
+  "ቶ": 83,
+  "ቹ": 84,
+  "ቺ": 85,
+  "ች": 86,
+  "ነ": 87,
+  "ኑ": 88,
+  "ኒ": 89,
+  "ና": 90,
+  "ኔ": 91,
+  "ን": 92,
+  "ኖ": 93,
+  "ኛ": 94,
+  "አ": 95,
+  "ኡ": 96,
+  "ኢ": 97,
+  "ኣ": 98,
+  "ኤ": 99,
+  "እ": 100,
+  "ኦ": 101,
+  "ከ": 102,
+  "ኩ": 103,
+  "ኪ": 104,
+  "ካ": 105,
+  "ኬ": 106,
+  "ክ": 107,
+  "ኮ": 108,
+  "ኰ": 109,
+  "ኳ": 110,
+  "ኸ": 111,
+  "ኺ": 112,
+  "ኻ": 113,
+  "ኽ": 114,
+  "ኾ": 115,
+  "ወ": 116,
+  "ዉ": 117,
+  "ዊ": 118,
+  "ዋ": 119,
+  "ዌ": 120,
+  "ው": 121,
+  "ዎ": 122,
+  "ዐ": 123,
+  "ዑ": 124,
+  "ዒ": 125,
+  "ዓ": 126,
+  "ዕ": 127,
+  "ዖ": 128,
+  "ዘ": 129,
+  "ዙ": 130,
+  "ዚ": 131,
+  "ዛ": 132,
+  "ዜ": 133,
+  "ዝ": 134,
+  "ዞ": 135,
+  "የ": 136,
+  "ዩ": 137,
+  "ዪ": 138,
+  "ያ": 139,
+  "ይ": 140,
+  "ዮ": 141,
+  "ደ": 142,
+  "ዱ": 143,
+  "ዲ": 144,
+  "ዳ": 145,
+  "ዴ": 146,
+  "ድ": 147,
+  "ዶ": 148,
+  "ጀ": 149,
+  "ጃ": 150,
+  "ጅ": 151,
+  "ገ": 152,
+  "ጉ": 153,
+  "ጊ": 154,
+  "ጋ": 155,
+  "ጌ": 156,
+  "ግ": 157,
+  "ጎ": 158,
+  "ጐ": 159,
+  "ጓ": 160,
+  "ጠ": 161,
+  "ጡ": 162,
+  "ጢ": 163,
+  "ጣ": 164,
+  "ጤ": 165,
+  "ጥ": 166,
+  "ጦ": 167,
+  "ጨ": 168,
+  "ጩ": 169,
+  "ጪ": 170,
+  "ጫ": 171,
+  "ጭ": 172,
+  "ጮ": 173,
+  "ጳ": 174,
+  "ጵ": 175,
+  "ጸ": 176,
+  "ጹ": 177,
+  "ጺ": 178,
+  "ጻ": 179,
+  "ጼ": 180,
+  "ጽ": 181,
+  "ጾ": 182,
+  "ፀ": 183,
+  "ፅ": 184,
+  "ፆ": 185,
+  "ፈ": 186,
+  "ፉ": 187,
+  "ፊ": 188,
+  "ፋ": 189,
+  "ፌ": 190,
+  "ፍ": 191,
+  "ፎ": 192,
+  "ፑ": 193,
+  "ፔ": 194
+}