maliahson committed
Commit 0e3078f · verified · 1 Parent(s): d26c840

Create app.py

Files changed (1):
  1. app.py +139 -0
app.py ADDED
@@ -0,0 +1,139 @@
+ import torch
+ from transformers import pipeline
+ import librosa
+ from datetime import datetime
+ from deep_translator import GoogleTranslator
+ from typing import Dict, Union
+ from gliner import GLiNER
+ import gradio as gr
+
+ # Model and device configuration for transcription
+ MODEL_NAME = "openai/whisper-large-v3-turbo"
+ device = 0 if torch.cuda.is_available() else "cpu"
+
+ # Initialize the Whisper pipeline; chunk_length_s=30 lets it transcribe audio longer than 30 s
+ pipe = pipeline(
+     task="automatic-speech-recognition",
+     model=MODEL_NAME,
+     chunk_length_s=30,
+     device=device,
+ )
+
+ # Initialize GLiNER for information extraction (kept on CPU)
+ gliner_model = GLiNER.from_pretrained("xomad/gliner-model-merge-large-v1.0").to("cpu")
+
+ def merge_entities(entities):
+     """Merge adjacent or contiguous entity spans that share the same label."""
+     if not entities:
+         return []
+     merged = []
+     current = entities[0]
+     for next_entity in entities[1:]:
+         # Same label and the next span starts where the current one ends (or one past it)
+         if next_entity['entity'] == current['entity'] and (next_entity['start'] == current['end'] + 1 or next_entity['start'] == current['end']):
+             current['word'] += ' ' + next_entity['word']
+             current['end'] = next_entity['end']
+         else:
+             merged.append(current)
+             current = next_entity
+     merged.append(current)
+     return merged
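+
+ # Illustration (hypothetical spans): two contiguous "match" fragments collapse into one:
+ #   merge_entities([{"entity": "match", "word": "20", "start": 0, "end": 2},
+ #                   {"entity": "match", "word": "acres", "start": 3, "end": 8}])
+ #   -> [{"entity": "match", "word": "20 acres", "start": 0, "end": 8}]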
+
+ def transcribe_audio(audio_path):
+     """
+     Transcribe a local audio file using the Whisper pipeline and log how long it took.
+     """
+     try:
+         start_time = datetime.now()
+
+         # Ensure audio is mono and resampled to 16 kHz, the rate Whisper expects
+         audio, sr = librosa.load(audio_path, sr=16000, mono=True)
+
+         # Perform transcription, forcing Urdu as the decoding language
+         transcription = pipe(audio, batch_size=8, generate_kwargs={"language": "urdu"})["text"]
+
+         # Log elapsed time
+         print(f"Transcription finished in {(datetime.now() - start_time).total_seconds():.1f}s")
+
+         return transcription
+
+     except Exception as e:
+         return f"Error processing audio: {e}"
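+
+ # Usage sketch (hypothetical file name):
+ #   text = transcribe_audio("urdu_interview.wav")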
+
+ def translate_text_to_english(text):
+     """
+     Translate text into English using GoogleTranslator (source language auto-detected).
+     """
+     try:
+         translated_text = GoogleTranslator(source='auto', target='en').translate(text)
+         return translated_text
+     except Exception as e:
+         return f"Error during translation: {e}"
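+
+ # e.g. translate_text_to_english("یہ ایک مثال ہے") should return roughly "This is an example"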
+
+ def extract_information(prompt: str, text: str, threshold: float, nested_ner: bool) -> Dict[str, Union[str, list]]:
+     """
+     Extract entities from the English text using the GLiNER model.
+     """
+     try:
+         # Prepend the prompt so the model extracts spans answering it, all labeled "match"
+         text = prompt + "\n" + text
+         entities = [
+             {
+                 "entity": entity["label"],
+                 "word": entity["text"],
+                 "start": entity["start"],
+                 "end": entity["end"],
+                 "score": 0,  # placeholder; scores are not surfaced in the UI
+             }
+             for entity in gliner_model.predict_entities(
+                 text, ["match"], flat_ner=not nested_ner, threshold=threshold
+             )
+         ]
+         merged_entities = merge_entities(entities)
+         return {"text": text, "entities": merged_entities}
+     except Exception as e:
+         # Return an empty result in the shape gr.HighlightedText expects, rather than an error dict it cannot render
+         print(f"Information extraction failed: {e}")
+         return {"text": text, "entities": []}
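+
+ # The returned dict is the format gr.HighlightedText consumes, e.g. (hypothetical values):
+ #   {"text": "Find the crop\nWheat was sown", "entities":
+ #    [{"entity": "match", "word": "Wheat", "start": 14, "end": 19, "score": 0}]}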
+
+ def pipeline_fn(audio, prompt, threshold, nested_ner):
+     """
+     Combine transcription, translation, and information extraction in a single pipeline.
+     """
+     transcription = transcribe_audio(audio)
+     if "Error" in transcription:
+         # Three outputs are expected downstream, so pad with empty values
+         return transcription, "", {}
+
+     translated_text = translate_text_to_english(transcription)
+     if "Error" in translated_text:
+         return transcription, translated_text, {}
+
+     info_extraction = extract_information(prompt, translated_text, threshold, nested_ner)
+     return transcription, translated_text, info_extraction
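+
+ # Programmatic use outside the UI (hypothetical file name):
+ #   transcription, translation, info = pipeline_fn("report.wav", "Extract crop names", 0.3, False)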
+
+ # Gradio interface
+ with gr.Blocks(title="Audio Processing and Information Extraction") as interface:
+     gr.Markdown("## Audio Transcription, Translation, and Information Extraction")
+
+     with gr.Row():
+         # On Gradio 4+, `source="upload"` becomes `sources=["upload"]`
+         audio_input = gr.Audio(source="upload", type="filepath", label="Upload Audio File")
+         prompt_input = gr.Textbox(label="Prompt for Information Extraction", placeholder="Enter your prompt here")
+
+     with gr.Row():
+         threshold_slider = gr.Slider(0, 1, value=0.3, step=0.01, label="NER Threshold")
+         nested_ner_checkbox = gr.Checkbox(label="Enable Nested NER")
+
+     with gr.Row():
+         # interactive=False renders the textboxes read-only (Textbox has no `readonly` argument)
+         transcription_output = gr.Textbox(label="Transcription (Urdu)", interactive=False)
+         translation_output = gr.Textbox(label="Translation (English)", interactive=False)
+
+     with gr.Row():
+         extraction_output = gr.HighlightedText(label="Extracted Information")
+
+     process_button = gr.Button("Process Audio")
+
+     process_button.click(
+         fn=pipeline_fn,
+         inputs=[audio_input, prompt_input, threshold_slider, nested_ner_checkbox],
+         outputs=[transcription_output, translation_output, extraction_output],
+     )
+
+ if __name__ == "__main__":
+     interface.launch()
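
The imports above imply the Space's dependencies; a minimal requirements.txt sketch (package names inferred from the imports, versions unpinned and unverified against this commit):

    torch
    transformers
    librosa
    deep-translator
    gliner
    gradio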