|
|
import os
import re
import subprocess
import sys
from datetime import datetime, timedelta

import dateutil.parser
import gradio as gr
import spacy
|
|
|
|
|
|
|
|
def load_spacy_model():
    """Load the spaCy English pipeline, downloading it on first use.

    Returns:
        The loaded ``en_core_web_sm`` pipeline (``spacy.language.Language``).

    Raises:
        subprocess.CalledProcessError: if the model download fails.
        OSError: if the model still cannot be loaded after downloading.
    """
    try:
        nlp = spacy.load("en_core_web_sm")
        print("Model loaded successfully.")
    except OSError:
        # Model package is missing: install it with the same interpreter,
        # then retry the load.  subprocess.run with an argument list avoids
        # building a shell command string (the old os.system call), and
        # check=True surfaces a failed download instead of ignoring the
        # exit status.
        print("Model not found. Downloading en_core_web_sm...")
        subprocess.run(
            [sys.executable, "-m", "spacy", "download", "en_core_web_sm", "--no-cache"],
            check=True,
        )
        nlp = spacy.load("en_core_web_sm")
        print("Model downloaded and loaded successfully.")
    return nlp
|
|
|
|
|
# Load the pipeline once at import time so every request reuses it.
nlp = load_spacy_model()
|
|
|
|
|
|
|
|
def normalize_data_time(text):
    """Fuzzily parse free-form text into a datetime.

    Args:
        text: string that may contain a date and/or time mixed with
            other words (``fuzzy=True`` skips the unparseable parts).

    Returns:
        The parsed ``datetime``, or ``None`` when nothing parseable
        is found.
    """
    try:
        return dateutil.parser.parse(text, fuzzy=True)
    except (ValueError, OverflowError, TypeError):
        # dateutil raises OverflowError (not just ValueError/TypeError)
        # for numeric components outside the representable range, so it
        # must be caught too for the "return None on junk" contract.
        return None
|
|
|
|
|
|
|
|
def calculate_duration(start_time, end_time):
    """Format the gap between two datetimes as ``"Xh Ym"``.

    Args:
        start_time: event start; must be a ``datetime`` to be usable.
        end_time: event end; must be a ``datetime`` to be usable.

    Returns:
        str: ``"<hours>h <minutes>m"``, or ``"Unknown"`` when either
        argument is not a ``datetime``.

    A negative span (end before start) is treated as crossing midnight:
    upstream parsing puts both clock times on the same calendar day, so
    "11 PM to 1 AM" would otherwise come out as a garbage negative value
    (floor division on a negative delta yielded e.g. "-22h 0m").
    """
    if not (isinstance(start_time, datetime) and isinstance(end_time, datetime)):
        return "Unknown"
    delta = end_time - start_time
    if delta < timedelta(0):
        # Overnight event: wrap the span past midnight.
        delta += timedelta(days=1)
    total_minutes = int(delta.total_seconds() / 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours}h {minutes}m"
|
|
|
|
|
|
|
|
def process_text(text):
    """Extract event details from free text and render them as a summary.

    Runs the module-level spaCy pipeline over *text*, collects PERSON /
    DATE / TIME / GPE entities (normalizing DATE spans to YYYY-MM-DD when
    parseable), picks a single event keyword, and pulls a start/end time
    pair out of the raw text with regexes.

    Args:
        text: raw or OCR-extracted text describing an event.

    Returns:
        str: a newline-separated "Field: value" summary.
    """
    parsed = nlp(text)

    buckets = {label: [] for label in ("PERSON", "DATE", "TIME", "GPE", "EVENT")}

    for span in parsed.ents:
        label = span.label_
        if label not in buckets:
            continue
        value = span.text
        if label == "DATE":
            # Prefer an ISO date when the span parses; otherwise keep the
            # original surface text (e.g. "tomorrow").
            when = normalize_data_time(value)
            if when:
                value = when.strftime("%Y-%m-%d")
        buckets[label].append(value)

    # First verb/noun in a prominent dependency slot stands in for the event.
    keyword = next(
        (
            token.lemma_
            for token in parsed
            if token.pos_ in ("VERB", "NOUN") and token.dep_ in ("ROOT", "dobj", "nsubj")
        ),
        None,
    )
    if keyword:
        buckets["EVENT"].append(keyword)

    begin = None
    finish = None

    # Explicit "X to/until/- Y" range first; AM/PM is optional on each side.
    range_rx = re.compile(r"(\d{1,2}(?::\d{2})?\s*(?:AM|PM|am|pm)?)\s*(?:to|until|-|–)\s*(\d{1,2}(?::\d{2})?\s*(?:AM|PM|am|pm)?)", re.IGNORECASE)
    hit = range_rx.search(text)

    if hit:
        begin = normalize_data_time(hit.group(1))
        finish = normalize_data_time(hit.group(2))
    else:
        # Fall back to the first one or two standalone AM/PM clock times.
        clock_hits = re.findall(r"\d{1,2}(?::\d{2})?\s*[AaPp][Mm]", text)
        if clock_hits:
            begin = normalize_data_time(clock_hits[0])
        if len(clock_hits) > 1:
            finish = normalize_data_time(clock_hits[1])

    duration = calculate_duration(begin, finish) if begin and finish else "Unknown"

    def joined(label):
        # Comma-join a bucket, or the literal string "None" when empty.
        return ", ".join(buckets[label]) if buckets[label] else "None"

    fields = (
        ("Event", buckets["EVENT"][0] if buckets["EVENT"] else "Unknown"),
        ("People", joined("PERSON")),
        ("Date", joined("DATE")),
        ("Start Time", begin.strftime("%H:%M") if begin else "None"),
        ("End Time", finish.strftime("%H:%M") if finish else "None"),
        ("Duration", duration),
        ("Location", joined("GPE")),
    )
    return "".join(f"{name}: {value}\n" for name, value in fields)
|
|
|
|
|
|
|
|
# Gradio UI: one free-text input box, one plain-text output panel wired
# straight to process_text.
demo = gr.Interface(
    fn=process_text,
    inputs="text",
    outputs="text",
    title="Text Normalization",
    description="Input text or OCR-extracted text to normalize and extract event with start time, end time, and duration"
)
|
|
|
|
|
if __name__ == "__main__":
    print("Launching Gradio application...")
    # 0.0.0.0 binds all interfaces so the server is reachable from outside
    # the host/container; port 7860 is Gradio's conventional default.
    demo.launch(server_name="0.0.0.0", server_port=7860)
|
|
|