import gradio as gr
import librosa
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration, pipeline
from jiwer import wer
# Select the GPU if one is available, otherwise fall back to CPU
device = 0 if torch.cuda.is_available() else "cpu"

# Load models
whisper_pipeline_1 = pipeline(
    "automatic-speech-recognition",
    model="maliahson/Finetuned_Whisper_Medium_Model_2",
    device=device,
)
whisper_pipeline_2 = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-large-v3-turbo",
    device=device,
)
# Set up openai/whisper-medium for Urdu transcription
processor = WhisperProcessor.from_pretrained("openai/whisper-medium")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-medium")
forced_decoder_ids = processor.get_decoder_prompt_ids(language="urdu", task="transcribe")
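# Note: recent transformers releases deprecate forced_decoder_ids in favour of
# passing the language/task straight to generate(), e.g.
# model.generate(input_features, language="ur", task="transcribe"); the
# forced-IDs approach above assumes a version that still supports it.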
def transcribe_with_whisper_medium(audio_path):
    """
    Transcribe audio using the openai/whisper-medium model with the decoder
    forced to Urdu transcription.
    """
    # The processor expects a raw waveform, not a file path, so load the audio
    # and resample it to the 16 kHz rate Whisper was trained on
    audio, _ = librosa.load(audio_path, sr=16000)
    inputs = processor(audio, sampling_rate=16000, return_tensors="pt")
    with torch.no_grad():
        # Generate the transcription using the forced decoder IDs for Urdu
        outputs = model.generate(
            inputs["input_features"], forced_decoder_ids=forced_decoder_ids
        )
    # Decode the generated token IDs back to text
    return processor.batch_decode(outputs, skip_special_tokens=True)[0]
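# Usage sketch (hypothetical file name): calling
# transcribe_with_whisper_medium("clip_ur.wav") returns the Urdu
# transcription of the clip as a single string.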
def transcribe_and_compare(audio_path, original_transcription=None):
    """
    Transcribes an audio file using three Whisper models and compares results.

    Args:
        audio_path (str): Path to the audio file.
        original_transcription (str, optional): Ground-truth transcription.

    Returns:
        dict: The three transcriptions, plus WER scores or a word-level diff.
    """
    # Transcriptions from all three models
    transcription_1 = whisper_pipeline_1(audio_path)["text"]
    transcription_2 = whisper_pipeline_2(audio_path)["text"]
    transcription_3 = transcribe_with_whisper_medium(audio_path)
    # Prepare comparison results
    comparison_result = {
        "Model 1 Output (maliahson/Finetuned_Whisper_Medium_Model_2)": transcription_1,
        "Model 2 Output (openai/whisper-large-v3-turbo)": transcription_2,
        "Model 3 Output (openai/whisper-medium, Urdu)": transcription_3,
    }
    if original_transcription:
        # Calculate Word Error Rate (WER) for all models against the ground truth
        comparison_result["WER Model 1"] = wer(original_transcription, transcription_1)
        comparison_result["WER Model 2"] = wer(original_transcription, transcription_2)
        comparison_result["WER Model 3"] = wer(original_transcription, transcription_3)
    else:
        # With no ground truth, report the words unique to each model's output.
        # Sets are not JSON serializable, so convert them to sorted lists for
        # the gr.JSON output component.
        words_1 = set(transcription_1.split())
        words_2 = set(transcription_2.split())
        words_3 = set(transcription_3.split())
        comparison_result["Difference Between Models"] = {
            "Model 1 Unique Words": sorted(words_1 - words_2 - words_3),
            "Model 2 Unique Words": sorted(words_2 - words_1 - words_3),
            "Model 3 Unique Words": sorted(words_3 - words_1 - words_2),
        }
    return comparison_result
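# jiwer's wer() scores each hypothesis as Word Error Rate:
# WER = (substitutions + deletions + insertions) / words in the reference,
# so 0.0 is a perfect match and values above 1.0 are possible. A sketch of a
# direct call, with a hypothetical file name and ground truth:
#   transcribe_and_compare("clip_ur.wav", original_transcription="...")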
# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("## Audio Transcription and Comparison")
    audio_input = gr.Audio(type="filepath", label="Upload or Record Audio")
    original_transcription = gr.Textbox(lines=2, label="Original Transcription (Optional)")
    output = gr.JSON(label="Comparison Results")
    submit_btn = gr.Button("Transcribe and Compare")
    submit_btn.click(
        transcribe_and_compare,
        inputs=[audio_input, original_transcription],
        outputs=output,
    )

demo.launch(debug=True)
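# Run locally with `python app.py`; Gradio serves the interface on
# http://localhost:7860 by default. On a Hugging Face Space the same
# demo.launch() call is picked up automatically.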