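"""
Gradio demo that transcribes an uploaded or recorded audio clip with three
Whisper models (a fine-tuned medium checkpoint, openai/whisper-large-v3-turbo,
and openai/whisper-medium forced to Urdu) and compares their outputs,
optionally against a reference transcription via Word Error Rate (WER).
"""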
import gradio as gr
import librosa
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration, pipeline
from jiwer import wer

# Load models
whisper_pipeline_1 = pipeline(
    "automatic-speech-recognition", 
    model="maliahson/Finetuned_Whisper_Medium_Model_2"
)

whisper_pipeline_2 = pipeline(
    "automatic-speech-recognition", 
    model="openai/whisper-large-v3-turbo", 
    device=0 if torch.cuda.is_available() else "cpu"
)

# Set up openai/whisper-medium for Urdu transcription
processor = WhisperProcessor.from_pretrained("openai/whisper-medium")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-medium")
forced_decoder_ids = processor.get_decoder_prompt_ids(language="urdu", task="transcribe")
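# The forced decoder IDs prepend the Urdu language token and the "transcribe"
# task token to every generation, so the model always transcribes in Urdu
# instead of auto-detecting the language or translating.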

def transcribe_with_whisper_medium(audio_path):
    """
    Transcribe audio using the openai/whisper-medium model with forced language settings for Urdu.
    """
    # WhisperProcessor expects raw audio samples rather than a file path, so
    # load the file and resample it to the 16 kHz rate Whisper was trained on.
    audio, _ = librosa.load(audio_path, sr=16000)
    inputs = processor(audio, return_tensors="pt", sampling_rate=16000)
    with torch.no_grad():
        # Generate the transcription using the forced decoder IDs for Urdu
        outputs = model.generate(
            inputs["input_features"], forced_decoder_ids=forced_decoder_ids
        )
    # Decode the outputs to text
    return processor.batch_decode(outputs, skip_special_tokens=True)[0]

def transcribe_and_compare(audio_path, original_transcription=None):
    """
    Transcribes an audio file using three Whisper models and compares results.
    
    Args:
        audio_path (str): Path to the audio file.
        original_transcription (str, optional): Ground truth transcription.
    Returns:
        dict: Results including transcriptions and WER calculations.
    """
    # Transcriptions from all three models
    transcription_1 = whisper_pipeline_1(audio_path)["text"]
    transcription_2 = whisper_pipeline_2(audio_path)["text"]
    transcription_3 = transcribe_with_whisper_medium(audio_path)
    
    # Prepare comparison results
    comparison_result = {
        "Model 1 Output (maliahson/Finetuned_Whisper_Medium_Model_2)": transcription_1,
        "Model 2 Output (openai/whisper-large-v3-turbo)": transcription_2,
        "Model 3 Output (Openai/whisper-medium, Urdu)": transcription_3
    }

    if original_transcription:
        # Calculate Word Error Rate (WER) for all models
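        # WER = (substitutions + deletions + insertions) / number of words in
        # the reference; 0.0 is a perfect match, and values above 1.0 are
        # possible when the hypothesis is much longer than the reference.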
        wer_1 = wer(original_transcription, transcription_1)
        wer_2 = wer(original_transcription, transcription_2)
        wer_3 = wer(original_transcription, transcription_3)
        
        # Add WER scores to results
        comparison_result["WER Model 1"] = wer_1
        comparison_result["WER Model 2"] = wer_2
        comparison_result["WER Model 3"] = wer_3
    else:
        # Compare outputs of all three models when no ground truth is provided.
        # Sorted lists (rather than sets) keep the result JSON-serializable.
        words_1 = set(transcription_1.split())
        words_2 = set(transcription_2.split())
        words_3 = set(transcription_3.split())
        comparison_result["Difference Between Models"] = {
            "Model 1 Unique Words": sorted(words_1 - words_2 - words_3),
            "Model 2 Unique Words": sorted(words_2 - words_1 - words_3),
            "Model 3 Unique Words": sorted(words_3 - words_1 - words_2),
        }

    return comparison_result

# Gradio Interface
with gr.Blocks() as demo:
    gr.Markdown("## Audio Transcription and Comparison")
    audio_input = gr.Audio(type="filepath", label="Upload or Record Audio")
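    # type="filepath" hands the handler a path to a temporary audio file,
    # which is what both the ASR pipelines and librosa.load expect.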
    original_transcription = gr.Textbox(lines=2, label="Original Transcription (Optional)")
    output = gr.JSON(label="Comparison Results")
    submit_btn = gr.Button("Transcribe and Compare")

    submit_btn.click(
        transcribe_and_compare,
        inputs=[audio_input, original_transcription],
        outputs=output
    )

demo.launch(debug=True)