import torch
from transformers import pipeline
import librosa
from datetime import datetime
from deep_translator import GoogleTranslator
from typing import Dict, Union
from gliner import GLiNER
import gradio as gr
# Model and device configuration for transcription
MODEL_NAME = "openai/whisper-large-v3-turbo"
device = 0 if torch.cuda.is_available() else "cpu"
# Initialize Whisper pipeline
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device,
)
# Initialize GLiNER for information extraction
gliner_model = GLiNER.from_pretrained("xomad/gliner-model-merge-large-v1.0").to("cpu")
def merge_entities(entities):
    """Merge adjacent entities that share a label into a single span."""
    if not entities:
        return []
    merged = []
    current = entities[0]
    for next_entity in entities[1:]:
        # Merge when the next entity has the same label and starts where the current one ends
        if next_entity['entity'] == current['entity'] and (next_entity['start'] == current['end'] + 1 or next_entity['start'] == current['end']):
            current['word'] += ' ' + next_entity['word']
            current['end'] = next_entity['end']
        else:
            merged.append(current)
            current = next_entity
    merged.append(current)
    return merged
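# Illustrative example (hypothetical offsets): two adjacent "match" spans
# {"word": "Ali", "start": 0, "end": 3} and {"word": "Khan", "start": 4, "end": 8}
# would be merged into a single entity {"word": "Ali Khan", "start": 0, "end": 8}.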
def transcribe_audio(audio_path):
    """
    Transcribe a local audio file with the Whisper pipeline and report how long it took.
    """
    try:
        # Log start time
        start_time = datetime.now()
        # Ensure audio is mono and resampled to 16 kHz, as Whisper expects
        audio, sr = librosa.load(audio_path, sr=16000, mono=True)
        # Perform transcription (Urdu is forced as the decoding language)
        transcription = pipe(audio, batch_size=8, generate_kwargs={"language": "urdu"})["text"]
        # Log end time and elapsed duration
        end_time = datetime.now()
        print(f"Transcription took {(end_time - start_time).total_seconds():.1f}s")
        return transcription
    except Exception as e:
        return f"Error processing audio: {e}"
def translate_text_to_english(text):
    """
    Translate text into English using GoogleTranslator.
    """
    try:
        # Perform translation (source language is auto-detected)
        translated_text = GoogleTranslator(source='auto', target='en').translate(text)
        return translated_text
    except Exception as e:
        return f"Error during translation: {e}"
def extract_information(prompt: str, text: str, threshold: float, nested_ner: bool) -> Dict[str, Union[str, int, float]]:
    """
    Extract entities from the English text using the GLiNER model.
    """
    try:
        # Prepend the user prompt so GLiNER extracts spans that match it
        text = prompt + "\n" + text
        entities = [
            {
                "entity": entity["label"],
                "word": entity["text"],
                "start": entity["start"],
                "end": entity["end"],
                "score": 0,
            }
            for entity in gliner_model.predict_entities(
                text, ["match"], flat_ner=not nested_ner, threshold=threshold
            )
        ]
        merged_entities = merge_entities(entities)
        return {"text": text, "entities": merged_entities}
    except Exception as e:
        return {"error": f"Information extraction failed: {e}"}
def pipeline_fn(audio, prompt, threshold, nested_ner):
    """
    Combine transcription, translation, and information extraction in a single pipeline.
    """
    transcription = transcribe_audio(audio)
    if "Error" in transcription:
        # Return empty placeholders for the remaining two outputs
        return transcription, "", {"text": "", "entities": []}
    translated_text = translate_text_to_english(transcription)
    if "Error" in translated_text:
        return transcription, translated_text, {"text": "", "entities": []}
    info_extraction = extract_information(prompt, translated_text, threshold, nested_ner)
    return transcription, translated_text, info_extraction
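# Example of calling the pipeline directly, without the UI (illustrative only;
# "sample_urdu.wav" is a placeholder path):
#   transcription, translation, extraction = pipeline_fn(
#       "sample_urdu.wav", "Extract the speaker's name and city", 0.3, False
#   )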
# Gradio Interface
with gr.Blocks(title="Audio Processing and Information Extraction") as interface:
    gr.Markdown("## Audio Transcription, Translation, and Information Extraction")
    with gr.Row():
        audio_input = gr.Audio(type="filepath", label="Upload Audio File")
        prompt_input = gr.Textbox(label="Prompt for Information Extraction", placeholder="Enter your prompt here")
    with gr.Row():
        threshold_slider = gr.Slider(0, 1, value=0.3, step=0.01, label="NER Threshold")
        nested_ner_checkbox = gr.Checkbox(label="Enable Nested NER")
    with gr.Row():
        transcription_output = gr.Textbox(label="Transcription (Urdu)", interactive=False)
        translation_output = gr.Textbox(label="Translation (English)", interactive=False)
    with gr.Row():
        extraction_output = gr.HighlightedText(label="Extracted Information")
    process_button = gr.Button("Process Audio")
    process_button.click(
        fn=pipeline_fn,
        inputs=[audio_input, prompt_input, threshold_slider, nested_ner_checkbox],
        outputs=[transcription_output, translation_output, extraction_output],
    )
if __name__ == "__main__":
    interface.launch()