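"""
Gradio app: transcribe an agricultural audio clip (Urdu) with a fine-tuned Whisper
model, translate the transcription to English, and highlight entities extracted by
GLiNER according to a user-supplied prompt.
"""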
import torch
from transformers import pipeline
import librosa
from datetime import datetime
from deep_translator import GoogleTranslator
from typing import Dict, Union
from gliner import GLiNER
import gradio as gr

# Load the fine-tuned Whisper transcription model (GPU if available, otherwise CPU)
device = 0 if torch.cuda.is_available() else "cpu"
whisper_pipeline_agri = pipeline("automatic-speech-recognition", model="maliahson/Finetuned_Whisper_Medium_Model_2", device=device)

# Initialize GLiNER for information extraction
gliner_model = GLiNER.from_pretrained("xomad/gliner-model-merge-large-v1.0").to("cpu")

def merge_entities(entities):
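    """Merge adjacent spans that share the same entity label into a single entity."""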
    if not entities:
        return []
    merged = []
    current = entities[0]
    for next_entity in entities[1:]:
        if next_entity['entity'] == current['entity'] and (next_entity['start'] == current['end'] + 1 or next_entity['start'] == current['end']):
            current['word'] += ' ' + next_entity['word']
            current['end'] = next_entity['end']
        else:
            merged.append(current)
            current = next_entity
    merged.append(current)
    return merged


def transcribe_audio(audio_path):
    """
    Transcribe a local audio file with the Whisper pipeline and log the elapsed time.
    """
    try:
        start_time = datetime.now()

        # Ensure audio is mono and resampled to 16 kHz, as Whisper expects
        audio, _ = librosa.load(audio_path, sr=16000, mono=True)

        # Set the pad token so the pipeline can batch inputs
        whisper_pipeline_agri.tokenizer.pad_token_id = whisper_pipeline_agri.model.config.eos_token_id

        # Perform transcription
        transcription = whisper_pipeline_agri(audio, batch_size=8)["text"]

        # Log how long transcription took
        elapsed = (datetime.now() - start_time).total_seconds()
        print(f"Transcription finished in {elapsed:.2f} s")

        return transcription

    except Exception as e:
        return f"Error processing audio: {e}"


def translate_text_to_english(text):
    """
    Translate text into English using GoogleTranslator.
    """
    try:
        # Perform translation
        translated_text = GoogleTranslator(source='auto', target='en').translate(text)
        return translated_text
    except Exception as e:
        return f"Error during translation: {e}"

def extract_information(prompt: str, text: str, threshold: float, nested_ner: bool) -> Dict[str, Union[str, int, float]]:
    """
    Extract entities from the English text using GLiNER model.
    """
    try:
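        # Prepend the prompt so GLiNER extracts the spans that "match" the instruction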
        text = prompt + "\n" + text
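        # Map GLiNER predictions to the dict format expected by gr.HighlightedText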
        entities = [
            {
                "entity": entity["label"],
                "word": entity["text"],
                "start": entity["start"],
                "end": entity["end"],
                "score": 0,
            }
            for entity in gliner_model.predict_entities(
                text, ["match"], flat_ner=not nested_ner, threshold=threshold
            )
        ]
        merged_entities = merge_entities(entities)
        return {"text": text, "entities": merged_entities}
    except Exception as e:
        return {"error": f"Information extraction failed: {e}"}

def pipeline_fn(audio, prompt, threshold, nested_ner):
    """
    Run transcription, translation, and information extraction as a single pipeline.
    """
    transcription = transcribe_audio(audio)
    if "Error" in transcription:
        return transcription, "", None

    translated_text = translate_text_to_english(transcription)
    if not translated_text or "Error" in translated_text:
        return transcription, translated_text or "", None

    info_extraction = extract_information(prompt, translated_text, threshold, nested_ner)
    return transcription, translated_text, info_extraction

# Gradio Interface
with gr.Blocks(title="Audio Processing and Information Extraction") as interface:
    gr.Markdown("## Audio Transcription, Translation, and Information Extraction")
    
    with gr.Row():
        audio_input = gr.Audio(type="filepath", label="Upload Audio File")
        prompt_input = gr.Textbox(label="Prompt for Information Extraction", placeholder="Enter your prompt here")
    
    with gr.Row():
        threshold_slider = gr.Slider(0, 1, value=0.3, step=0.01, label="NER Threshold")
        nested_ner_checkbox = gr.Checkbox(label="Enable Nested NER")
    
    with gr.Row():
        transcription_output = gr.Textbox(label="Transcription (Urdu)", interactive=False)
        translation_output = gr.Textbox(label="Translation (English)", interactive=False)
    
    with gr.Row():
        extraction_output = gr.HighlightedText(label="Extracted Information")

    process_button = gr.Button("Process Audio")

    process_button.click(
        fn=pipeline_fn,
        inputs=[audio_input, prompt_input, threshold_slider, nested_ner_checkbox],
        outputs=[transcription_output, translation_output, extraction_output],
    )

if __name__ == "__main__":
    interface.launch()
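    # Optionally pass share=True to interface.launch() for a temporary public link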