maliahson committed
Commit bbe1a26 · verified · 1 Parent(s): 6f13335

Update app.py

Files changed (1):
  1. app.py +25 -8
app.py CHANGED
@@ -1,6 +1,7 @@
  import gradio as gr
  import torch
- from transformers import pipeline
+ from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
+ import librosa  # used to decode audio files into 16 kHz waveforms for the processor
  from jiwer import wer

  # Load models
@@ -15,12 +15,28 @@ whisper_pipeline_2 = pipeline(
      device=0 if torch.cuda.is_available() else "cpu"
  )

- whisper_pipeline_3 = pipeline(
-     "automatic-speech-recognition",
-     model="openai/whisper-medium",
-     device=0 if torch.cuda.is_available() else "cpu",
-     model_kwargs={"language": "<|ur|>"}
- )
+ # Set up openai/whisper-medium for Urdu transcription
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
+ model_3 = AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-medium").to(device)
+ processor_3 = AutoProcessor.from_pretrained("openai/whisper-medium")
+
+ # Set forced decoder IDs for Urdu
+ language = "ur"  # language code for Urdu (get_decoder_prompt_ids expects "ur"/"urdu", not the raw "<|ur|>" token)
+ task = "transcribe"  # transcription task ("translate" would output English instead)
+ forced_decoder_ids = processor_3.get_decoder_prompt_ids(language=language, task=task)
+
+ def transcribe_with_whisper_medium(audio_path):
+     """
+     Transcribe audio using the openai/whisper-medium model with forced language settings for Urdu.
+     """
+     # The processor expects a raw waveform, not a file path, so decode and resample to 16 kHz first
+     audio, _ = librosa.load(audio_path, sr=16000)
+     inputs = processor_3(audio, sampling_rate=16000, return_tensors="pt")
+     with torch.no_grad():
+         outputs = model_3.generate(
+             inputs["input_features"].to(device), forced_decoder_ids=forced_decoder_ids
+         )
+     return processor_3.batch_decode(outputs, skip_special_tokens=True)[0]

  def transcribe_and_compare(audio_path, original_transcription=None):
      """
@@ -35,7 +52,7 @@ def transcribe_and_compare(audio_path, original_transcription=None):
      # Transcriptions from all three models
      transcription_1 = whisper_pipeline_1(audio_path)["text"]
      transcription_2 = whisper_pipeline_2(audio_path)["text"]
-     transcription_3 = whisper_pipeline_3(audio_path)["text"]
+     transcription_3 = transcribe_with_whisper_medium(audio_path)

      # Prepare comparison results
      comparison_result = {
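
Note on the change: in transformers, model_kwargs is forwarded to from_pretrained, so the removed model_kwargs={"language": "<|ur|>"} never reached generation, which is presumably why this commit switches to explicit forced decoder IDs. On recent transformers versions the same Urdu forcing can also stay inside the pipeline API by forwarding generate_kwargs instead. A minimal sketch of that alternative (not the committed code; the pipeline name and sample file below are placeholders):

import torch
from transformers import pipeline

# Sketch: the ASR pipeline forwards generate_kwargs to model.generate,
# so language and task can be forced without building forced_decoder_ids by hand.
whisper_urdu = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-medium",
    device=0 if torch.cuda.is_available() else "cpu",
    generate_kwargs={"language": "urdu", "task": "transcribe"},
)

print(whisper_urdu("sample_ur.wav")["text"])  # "sample_ur.wav" is a hypothetical test file

The manual AutoModelForSpeechSeq2Seq route in the commit trades this convenience for explicit control over decoding (torch.no_grad, batch_decode), while the pipeline route handles audio file decoding and long-audio chunking itself; which is available depends on the installed transformers version.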