import gradio as gr
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration, pipeline
from jiwer import wer
import librosa  # used to load audio for the whisper-medium processor; add to requirements.txt if missing
# Pick the device once so every model runs on the GPU when one is available
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load models
whisper_pipeline_1 = pipeline(
    "automatic-speech-recognition",
    model="maliahson/Finetuned_Whisper_Medium_Model_2",
    device=device
)
whisper_pipeline_2 = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-large-v3-turbo",
    device=device
)
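# Note: the ASR pipelines accept a file path directly and handle audio decoding and
# resampling internally (ffmpeg is required for that), so no manual loading is needed for them.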
# Set up openai/whisper-medium for Urdu transcription
processor = WhisperProcessor.from_pretrained("openai/whisper-medium")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-medium").to(device)
forced_decoder_ids = processor.get_decoder_prompt_ids(language="urdu", task="transcribe")
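# Note: newer transformers releases prefer passing language/task to generate() directly
# (e.g. model.generate(..., language="urdu", task="transcribe")); forced_decoder_ids still
# works but may trigger a deprecation warning depending on the installed version.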
def transcribe_with_whisper_medium(audio_path):
"""
Transcribe audio using the openai/whisper-medium model with forced language settings for Urdu.
"""
    # The processor expects a waveform, not a file path: load and resample to 16 kHz first
    audio, _ = librosa.load(audio_path, sr=16000)
    inputs = processor(audio, return_tensors="pt", sampling_rate=16000)
with torch.no_grad():
# Generate the transcription using the forced decoder IDs for Urdu
outputs = model.generate(
inputs["input_features"], forced_decoder_ids=forced_decoder_ids
)
# Decode the outputs to text
return processor.batch_decode(outputs, skip_special_tokens=True)[0]
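# Quick sanity check (a sketch, assuming a local 16 kHz mono file named sample.wav exists):
#   print(transcribe_with_whisper_medium("sample.wav"))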
def transcribe_and_compare(audio_path, original_transcription=None):
"""
Transcribes an audio file using three Whisper models and compares results.
Args:
audio_path (str): Path to the audio file.
original_transcription (str, optional): Ground truth transcription.
Returns:
dict: Results including transcriptions and WER calculations.
"""
# Transcriptions from all three models
transcription_1 = whisper_pipeline_1(audio_path)["text"]
transcription_2 = whisper_pipeline_2(audio_path)["text"]
transcription_3 = transcribe_with_whisper_medium(audio_path)
# Prepare comparison results
comparison_result = {
"Model 1 Output (maliahson/Finetuned_Whisper_Medium_Model_2)": transcription_1,
"Model 2 Output (openai/whisper-large-v3-turbo)": transcription_2,
"Model 3 Output (Openai/whisper-medium, Urdu)": transcription_3
}
if original_transcription:
# Calculate Word Error Rate (WER) for all models
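        # WER = (substitutions + deletions + insertions) / number of reference words,
        # so lower is better; values above 1.0 are possible for very poor transcriptions.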
wer_1 = wer(original_transcription, transcription_1)
wer_2 = wer(original_transcription, transcription_2)
wer_3 = wer(original_transcription, transcription_3)
# Add WER scores to results
comparison_result["WER Model 1"] = wer_1
comparison_result["WER Model 2"] = wer_2
comparison_result["WER Model 3"] = wer_3
else:
        # Compare outputs of all three models when no ground truth is provided;
        # sorted lists (rather than sets) keep the result JSON-serializable for gr.JSON
        words_1 = set(transcription_1.split())
        words_2 = set(transcription_2.split())
        words_3 = set(transcription_3.split())
        comparison_result["Difference Between Models"] = {
            "Model 1 Unique Words": sorted(words_1 - words_2 - words_3),
            "Model 2 Unique Words": sorted(words_2 - words_1 - words_3),
            "Model 3 Unique Words": sorted(words_3 - words_1 - words_2),
        }
return comparison_result
# Gradio Interface
with gr.Blocks() as demo:
gr.Markdown("## Audio Transcription and Comparison")
audio_input = gr.Audio(type="filepath", label="Upload or Record Audio")
original_transcription = gr.Textbox(lines=2, label="Original Transcription (Optional)")
output = gr.JSON(label="Comparison Results")
submit_btn = gr.Button("Transcribe and Compare")
submit_btn.click(
transcribe_and_compare,
inputs=[audio_input, original_transcription],
outputs=output
)
demo.launch(debug=True)