Den4ikAI committed on
Commit de5ffbb · verified · 1 Parent(s): 2e0ae8f

Upload folder using huggingface_hub

Files changed (5):
  1. README.md +173 -0
  2. config.json +269 -0
  3. model.bin +3 -0
  4. tokenizer.json +0 -0
  5. vocabulary.json +0 -0
README.md ADDED
@@ -0,0 +1,173 @@
---
language:
- en
- zh
- de
- es
- ru
- ko
- fr
- ja
- pt
- tr
- pl
- ca
- nl
- ar
- sv
- it
- id
- hi
- fi
- vi
- he
- uk
- el
- ms
- cs
- ro
- da
- hu
- ta
- no
- th
- ur
- hr
- bg
- lt
- la
- mi
- ml
- cy
- sk
- te
- fa
- lv
- bn
- sr
- az
- sl
- kn
- et
- mk
- br
- eu
- is
- hy
- ne
- mn
- bs
- kk
- sq
- sw
- gl
- mr
- pa
- si
- km
- sn
- yo
- so
- af
- oc
- ka
- be
- tg
- sd
- gu
- am
- yi
- lo
- uz
- fo
- ht
- ps
- tk
- nn
- mt
- sa
- lb
- my
- bo
- tl
- mg
- as
- tt
- haw
- ln
- ha
- ba
- jw
- su
tags:
- audio
- automatic-speech-recognition
license: mit
base_model:
- openai/whisper-large-v2
pipeline_tag: automatic-speech-recognition
---

# Den4ikAI/whisper-large-v2-no-digits-norm-punct

This is a special version of the `openai/whisper-large-v2` model whose vocabulary has had all tokens corresponding to digits removed, as well as tokens with extraneous punctuation.

The primary goal of this modification is to **force the model to generate numbers as words rather than digits**. This is extremely useful for text normalization tasks, for example when preparing data for text-to-speech (TTS) systems, where numbers need to be fully spelled out.

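A similar effect can be approximated with the stock checkpoint by suppressing digit-bearing tokens at decoding time instead of editing the vocabulary. The snippet below is only a minimal sketch of that idea, assuming the standard `transformers` tokenizer API and the `suppress_tokens` argument of `generate()`; this repository bakes the restriction into the vocabulary itself, so no such extra step is needed here.

```python
from transformers import WhisperTokenizer

# Sketch: collect the IDs of every vocabulary token that contains a digit.
# With the original openai/whisper-large-v2, these IDs could be passed as
# model.generate(..., suppress_tokens=digit_token_ids) to keep digits out of
# the output; this model removes such tokens from the vocabulary altogether.
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-large-v2")

digit_token_ids = sorted(
    token_id
    for token, token_id in tokenizer.get_vocab().items()
    if any(ch.isdigit() for ch in token)
)

print(f"{len(digit_token_ids)} digit-bearing tokens found")
```
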
## Comparison with the Original Model

Here’s a clear example demonstrating the difference in behavior between the models when transcribing the same audio clip containing the phrase “Билет стоил двадцать тысяч рублей” (“The ticket cost twenty thousand rubles”).

| Model | Transcription Output |
| --- | --- |
| `openai/whisper-large-v2` (Original) | `<\|startoftranscript\|><\|ru\|><\|transcribe\|><\|notimestamps\|> Билет стоил **20000** рублей.<\|endoftext\|>` |
| `Den4ikAI/whisper-large-v2-no-digits-norm-punct` (This model) | `<\|startoftranscript\|><\|ru\|><\|transcribe\|><\|notimestamps\|> Билет стоил **двадцать тысяч** рублей.<\|endoftext\|>` |

As you can see, this modified model correctly normalized the number into words, whereas the original version left it as digits.

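To reproduce this comparison yourself, you can run both checkpoints over the same clip. The snippet below is a minimal sketch using the `transformers` ASR pipeline; the file name `numbers5.mp3` is the illustrative clip from the usage example below, and any short recording with spoken numbers will do.

```python
from transformers import pipeline

# Transcribe the same clip with the original and the modified checkpoint.
for model_id in [
    "openai/whisper-large-v2",
    "Den4ikAI/whisper-large-v2-no-digits-norm-punct",
]:
    asr = pipeline("automatic-speech-recognition", model=model_id)
    result = asr(
        "numbers5.mp3",
        generate_kwargs={"language": "russian", "task": "transcribe"},
    )
    print(f"{model_id}: {result['text']}")
```
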
## How to Use

You can use this model just like any other Whisper model in the `transformers` library.

```python
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import torchaudio
import torch

# Specify the device (GPU if available)
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Load the audio file
wav, sr = torchaudio.load("numbers5.mp3")
# Convert to mono and resample to 16 kHz
if wav.shape[0] > 1:
    wav = torch.mean(wav, dim=0, keepdim=True)
resampler = torchaudio.transforms.Resample(sr, 16000)
wav = resampler(wav)
audio_input = wav.squeeze(0)

# Load the processor and model
model_id = "Den4ikAI/whisper-large-v2-no-digits-norm-punct"
processor = WhisperProcessor.from_pretrained(model_id)
model = WhisperForConditionalGeneration.from_pretrained(model_id).to(device)

# Prepare inputs and extract features
input_features = processor(
    audio_input,
    sampling_rate=16000,
    return_tensors="pt"
).input_features.to(device)

# Generate token IDs (for Russian, specify language="russian")
predicted_ids = model.generate(input_features, language="russian", task="transcribe")

# Decode tokens back to text
transcription = processor.batch_decode(
    predicted_ids,
    skip_special_tokens=False
)

print(transcription)

# Example output for an audio clip with numbers:
# ['<|startoftranscript|><|ru|><|transcribe|><|notimestamps|> Билет стоил двадцать тысяч рублей.<|endoftext|>']
```
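
If you only need the plain text without the special tokens, pass `skip_special_tokens=True` to `processor.batch_decode`; the example above keeps them so the raw model output is visible.
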
config.json ADDED
@@ -0,0 +1,269 @@
{
  "alignment_heads": [
    [10, 12], [13, 17], [16, 11], [16, 12], [16, 13], [17, 15], [17, 16],
    [18, 4], [18, 11], [18, 19], [19, 11], [21, 2], [21, 3], [22, 3],
    [22, 9], [22, 12], [23, 5], [23, 7], [23, 13], [25, 5], [26, 1],
    [26, 12], [27, 15]
  ],
  "lang_ids": [
    49641, 49642, 49643, 49644, 49645, 49646, 49647, 49648, 49649, 49650,
    49651, 49652, 49653, 49654, 49655, 49656, 49657, 49658, 49659, 49660,
    49661, 49662, 49663, 49664, 49665, 49666, 49667, 49668, 49669, 49670,
    49671, 49672, 49673, 49674, 49675, 49676, 49677, 49678, 49679, 49680,
    49681, 49682, 49683, 49684, 49685, 49686, 49687, 49688, 49689, 49690,
    49691, 49692, 49693, 49694, 49695, 49696, 49697, 49698, 49699, 49700,
    49701, 49702, 49703, 49704, 49705, 49706, 49707, 49708, 49709, 49710,
    49711, 49712, 49713, 49714, 49715, 49716, 49717, 49718, 49719, 49720,
    49721, 49722, 49723, 49724, 49725, 49726, 49727, 49728, 49729, 49730,
    49731, 49732, 49733, 49734, 49735, 49736, 49737, 49738, 49739
  ],
  "suppress_ids": [
    1, 3, 4, 8, 9, 324, 467, 486, 506, 834,
    862, 878, 882, 891, 1305, 1801, 1929, 2400, 2566, 3178,
    3185, 3200, 3461, 3768, 3883, 4103, 4580, 6473, 6533, 7148,
    8901, 10253, 10747, 11742, 11835, 12130, 12359, 13567, 13924, 14396,
    15019, 15372, 16292, 16343, 18081, 18670, 21357, 22189, 25749, 25780,
    26052, 27878, 31206, 31852, 32019, 36364, 42314, 46827, 49255, 49636,
    49640, 49740, 49741, 49742, 49743, 49744
  ],
  "suppress_ids_begin": [186, 49639]
}
model.bin ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:767e94f2ec812dbf97116aca47dd2145f348726b9d869be8d8e9d5ade920380f
size 3085330957
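
The `model.bin` / `config.json` / `vocabulary.json` layout above matches a CTranslate2 conversion of Whisper, so this upload can presumably also be loaded with `faster-whisper`; that is an assumption based on the file layout, not something documented in the README, which only covers the `transformers` path. A minimal sketch under that assumption:

```python
from faster_whisper import WhisperModel

# Assumption: this folder is a CTranslate2 conversion, so it should load with
# faster-whisper from a local download of the repository (path is a placeholder).
model = WhisperModel("path/to/whisper-large-v2-no-digits-norm-punct", device="cpu")

segments, _info = model.transcribe("numbers5.mp3", language="ru", task="transcribe")
for segment in segments:
    print(segment.text)
```
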
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
vocabulary.json ADDED
The diff for this file is too large to render. See raw diff