import torch
from transformers import pipeline
import librosa
from datetime import datetime
from deep_translator import GoogleTranslator
from typing import Dict, Union
from gliner import GLiNER
import gradio as gr
# Select GPU (device 0) if available, otherwise fall back to CPU
device = 0 if torch.cuda.is_available() else "cpu"
# Load the fine-tuned Whisper transcription model on the selected device
whisper_pipeline_agri = pipeline(
    "automatic-speech-recognition",
    model="maliahson/Finetuned_Whisper_Medium_Model_2",
    device=device,
)
# Initialize GLiNER for information extraction
gliner_model = GLiNER.from_pretrained("xomad/gliner-model-merge-large-v1.0").to("cpu")
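# GLiNER returns character-level spans; merge_entities below joins adjacent
# spans that share a label so multi-word matches render as a single
# highlight in gr.HighlightedText.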
def merge_entities(entities):
if not entities:
return []
merged = []
current = entities[0]
for next_entity in entities[1:]:
if next_entity['entity'] == current['entity'] and (next_entity['start'] == current['end'] + 1 or next_entity['start'] == current['end']):
current['word'] += ' ' + next_entity['word']
current['end'] = next_entity['end']
else:
merged.append(current)
current = next_entity
merged.append(current)
return merged
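# Example: two adjacent 'match' spans such as
#   {'entity': 'match', 'word': 'wheat', 'start': 10, 'end': 15}
#   {'entity': 'match', 'word': 'crop',  'start': 16, 'end': 20}
# merge into {'entity': 'match', 'word': 'wheat crop', 'start': 10, 'end': 20}.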
def transcribe_audio(audio_path):
"""
Transcribe a local audio file using the Whisper pipeline, log timing, and save transcription to a file.
"""
try:
# Log start time
start_time = datetime.now()
        # Whisper expects 16 kHz mono input, so resample on load
        audio, sr = librosa.load(audio_path, sr=16000, mono=True)
        # Whisper's tokenizer has no pad token by default; reuse EOS so batched decoding works
        whisper_pipeline_agri.tokenizer.pad_token_id = whisper_pipeline_agri.model.config.eos_token_id
# Perform transcription
transcription = whisper_pipeline_agri(audio, batch_size=8)["text"]
        # Log end time and report the elapsed transcription time
        end_time = datetime.now()
        print(f"Transcription took {(end_time - start_time).total_seconds():.2f}s")
        return transcription
except Exception as e:
return f"Error processing audio: {e}"
def translate_text_to_english(text):
"""
Translate text into English using GoogleTranslator.
"""
try:
# Perform translation
translated_text = GoogleTranslator(source='auto', target='en').translate(text)
return translated_text
except Exception as e:
return f"Error during translation: {e}"
def extract_information(prompt: str, text: str, threshold: float, nested_ner: bool) -> Dict[str, Union[str, int, float]]:
"""
Extract entities from the English text using GLiNER model.
"""
try:
text = prompt + "\n" + text
entities = [
{
"entity": entity["label"],
"word": entity["text"],
"start": entity["start"],
"end": entity["end"],
"score": 0,
}
for entity in gliner_model.predict_entities(
text, ["match"], flat_ner=not nested_ner, threshold=threshold
)
]
merged_entities = merge_entities(entities)
return {"text": text, "entities": merged_entities}
except Exception as e:
return {"error": f"Information extraction failed: {e}"}
def pipeline_fn(audio, prompt, threshold, nested_ner):
"""
Combine transcription, translation, and information extraction in a single pipeline.
"""
    transcription = transcribe_audio(audio)
    if transcription.startswith("Error"):
        # Keep the output arity at three to match the Gradio outputs list
        return transcription, "", {"text": "", "entities": []}
    translated_text = translate_text_to_english(transcription)
    if translated_text.startswith("Error"):
        return transcription, translated_text, {"text": "", "entities": []}
    info_extraction = extract_information(prompt, translated_text, threshold, nested_ner)
    return transcription, translated_text, info_extraction
# Gradio Interface
with gr.Blocks(title="Audio Processing and Information Extraction") as interface:
gr.Markdown("## Audio Transcription, Translation, and Information Extraction")
with gr.Row():
        # Audio upload widget (Gradio 4.x dropped gr.Audio's 'source' argument)
        audio_input = gr.Audio(type="filepath", label="Upload Audio File")
prompt_input = gr.Textbox(label="Prompt for Information Extraction", placeholder="Enter your prompt here")
with gr.Row():
threshold_slider = gr.Slider(0, 1, value=0.3, step=0.01, label="NER Threshold")
nested_ner_checkbox = gr.Checkbox(label="Enable Nested NER")
with gr.Row():
        transcription_output = gr.Textbox(label="Transcription (Urdu)", interactive=False)
        translation_output = gr.Textbox(label="Translation (English)", interactive=False)
with gr.Row():
extraction_output = gr.HighlightedText(label="Extracted Information")
process_button = gr.Button("Process Audio")
process_button.click(
fn=pipeline_fn,
inputs=[audio_input, prompt_input, threshold_slider, nested_ner_checkbox],
outputs=[transcription_output, translation_output, extraction_output],
)
if __name__ == "__main__":
interface.launch()
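    # Pass share=True to interface.launch() to expose a temporary public URL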