import gradio as gr
import librosa
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration, pipeline
from jiwer import wer

# Pick a device once so all models agree (0 = first GPU, -1 = CPU).
device = 0 if torch.cuda.is_available() else -1

# Load the two pipeline-based models
whisper_pipeline_1 = pipeline(
    "automatic-speech-recognition",
    model="maliahson/Finetuned_Whisper_Medium_Model_2",
    device=device
)

whisper_pipeline_2 = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-large-v3-turbo",
    device=device
)

# Set up openai/whisper-medium for Urdu transcription
processor = WhisperProcessor.from_pretrained("openai/whisper-medium")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-medium")
forced_decoder_ids = processor.get_decoder_prompt_ids(language="urdu", task="transcribe")


def transcribe_with_whisper_medium(audio_path):
    """
    Transcribe audio using the openai/whisper-medium model with forced
    language settings for Urdu.
    """
    # The processor expects a waveform, not a file path, so load and
    # resample the audio to Whisper's expected 16 kHz first.
    audio, _ = librosa.load(audio_path, sr=16000)
    inputs = processor(audio, sampling_rate=16000, return_tensors="pt")

    with torch.no_grad():
        # Generate the transcription using the forced decoder IDs for Urdu
        outputs = model.generate(
            inputs["input_features"],
            forced_decoder_ids=forced_decoder_ids
        )

    # Decode the output tokens to text
    return processor.batch_decode(outputs, skip_special_tokens=True)[0]


def transcribe_and_compare(audio_path, original_transcription=None):
    """
    Transcribe an audio file using three Whisper models and compare the results.

    Args:
        audio_path (str): Path to the audio file.
        original_transcription (str, optional): Ground-truth transcription.

    Returns:
        dict: Transcriptions from all three models, plus WER scores when a
        ground truth is supplied, or per-model unique words when it is not.
    """
    # Transcriptions from all three models
    transcription_1 = whisper_pipeline_1(audio_path)["text"]
    transcription_2 = whisper_pipeline_2(audio_path)["text"]
    transcription_3 = transcribe_with_whisper_medium(audio_path)

    # Prepare comparison results
    comparison_result = {
        "Model 1 Output (maliahson/Finetuned_Whisper_Medium_Model_2)": transcription_1,
        "Model 2 Output (openai/whisper-large-v3-turbo)": transcription_2,
        "Model 3 Output (openai/whisper-medium, Urdu)": transcription_3
    }

    if original_transcription:
        # Calculate the Word Error Rate (WER) of each model against the ground truth
        comparison_result["WER Model 1"] = wer(original_transcription, transcription_1)
        comparison_result["WER Model 2"] = wer(original_transcription, transcription_2)
        comparison_result["WER Model 3"] = wer(original_transcription, transcription_3)
    else:
        # With no ground truth, report the words unique to each model's output.
        # Sets are not JSON-serializable, so convert to sorted lists for gr.JSON.
        words_1 = set(transcription_1.split())
        words_2 = set(transcription_2.split())
        words_3 = set(transcription_3.split())
        comparison_result["Difference Between Models"] = {
            "Model 1 Unique Words": sorted(words_1 - words_2 - words_3),
            "Model 2 Unique Words": sorted(words_2 - words_1 - words_3),
            "Model 3 Unique Words": sorted(words_3 - words_1 - words_2),
        }

    return comparison_result


# Gradio Interface
with gr.Blocks() as demo:
    gr.Markdown("## Audio Transcription and Comparison")

    audio_input = gr.Audio(type="filepath", label="Upload or Record Audio")
    original_transcription = gr.Textbox(lines=2, label="Original Transcription (Optional)")
    output = gr.JSON(label="Comparison Results")

    submit_btn = gr.Button("Transcribe and Compare")
    submit_btn.click(
        transcribe_and_compare,
        inputs=[audio_input, original_transcription],
        outputs=output
    )

demo.launch(debug=True)
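
# Usage sketch: `demo.launch(debug=True)` above already starts the Gradio app
# when this file is run directly. To exercise the comparison logic without the
# UI, something like the following should work, assuming the dependencies
# (torch, transformers, jiwer, librosa, gradio) are installed and given a
# local audio file (the path and ground-truth text below are hypothetical):
#
#     result = transcribe_and_compare("sample_urdu.wav", "ground-truth text here")
#     for key, value in result.items():
#         print(key, ":", value)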