# spaCy-C / app.py — Gradio Space by Zoe911 (commit 1ce3e50)
import os
import re
import subprocess
import sys
from datetime import datetime, timedelta

import dateutil.parser
import gradio as gr
import spacy
# Load the spaCy large English model, downloading it on first run.
try:
    nlp = spacy.load("en_core_web_lg")
except OSError:
    print("Model not found. Downloading en_core_web_lg...")
    # Run the downloader with an argument list (shell=False) instead of
    # os.system with a shell string; check=True makes a failed download
    # raise immediately instead of silently retrying spacy.load below.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_lg", "--no-cache"],
        check=True,
    )
    nlp = spacy.load("en_core_web_lg")
# Helper function: normalize date and time expressions.
def normalize_data_time(text):
    """Fuzzily parse a date/time expression into a datetime.

    Args:
        text: A natural-language date/time expression (e.g. "3PM",
            "March 5th 2024").

    Returns:
        datetime: The parsed value, or None when parsing fails.

    Note:
        Every caller in this file subtracts or strftime()s the result,
        so the contract is a datetime object (previously this returned
        a formatted string, which crashed those call sites).
    """
    try:
        # Use dateutil to parse the time; fuzzy=True skips unrelated words.
        return dateutil.parser.parse(text, fuzzy=True)
    except (ValueError, TypeError, OverflowError):
        # Unparseable input: signal failure explicitly with None.
        return None
def calculate_duration(start_time, end_time):
    """Format the gap between two datetimes as "<hours>h <minutes>m".

    Returns "Unknown" when either endpoint is missing/falsy.
    """
    if not (start_time and end_time):
        return "Unknown"
    elapsed = end_time - start_time
    # Truncate to whole minutes, then split into hours + leftover minutes.
    hours, minutes = divmod(int(elapsed.total_seconds() / 60), 60)
    return f"{hours}h {minutes}m"
# --- main function helpers ---
def _parse_datetime(text):
    """Fuzzily parse *text* into a datetime; return None when unparseable."""
    try:
        return dateutil.parser.parse(text, fuzzy=True)
    except (ValueError, TypeError, OverflowError):
        return None


# Matches ranges like "3PM to 4PM", "10:30 until 11:15", "9 - 17".
# (The original pattern had a stray `\d*` where `\s*` was intended, so the
# first time could not be separated from its AM/PM marker by a space.)
_TIME_RANGE_RE = re.compile(
    r"(\d{1,2}(?::\d{2})?\s*(?:AM|PM)?)\s*(?:to|until|-)\s*"
    r"(\d{1,2}(?::\d{2})?\s*(?:AM|PM)?)",
    re.IGNORECASE,
)


# main function
def process_text(text):
    """Extract a structured event summary from free-form text.

    Uses spaCy NER for people/dates/times/locations, a dependency-based
    heuristic for the event word, and a regex + dateutil for time ranges.

    Args:
        text: Raw (possibly OCR-extracted) text describing an event.

    Returns:
        str: A multi-line "Field: value" summary.
    """
    doc = nlp(text)

    # Buckets for the entity labels we care about.
    entities = {
        "PERSON": [],  # names
        "DATE": [],    # dates
        "TIME": [],    # times
        "GPE": [],     # locations
        "EVENT": [],   # events (filled by the heuristic below)
    }

    # Extract information from spaCy's named-entity recognition.
    for ent in doc.ents:
        if ent.label_ not in entities:
            continue
        if ent.label_ == "DATE":
            # Normalize dates to ISO form when parseable; keep raw text
            # otherwise. (The original called .strftime() on a string,
            # which raised AttributeError for every DATE entity.)
            parsed = _parse_datetime(ent.text)
            entities["DATE"].append(parsed.strftime("%Y-%m-%d") if parsed else ent.text)
        else:
            entities[ent.label_].append(ent.text)

    # Infer the event: first ROOT/dobj/nsubj verb or noun, lemmatized.
    for token in doc:
        if token.pos_ in ("VERB", "NOUN") and token.dep_ in ("ROOT", "dobj", "nsubj"):
            entities["EVENT"].append(token.lemma_)
            break

    # Extract start/end times; prefer an explicit range like "3PM to 4PM".
    start_time = None
    end_time = None
    match = _TIME_RANGE_RE.search(text)
    if match:
        start_str, end_str = match.groups()
        start_time = _parse_datetime(start_str)
        end_time = _parse_datetime(end_str)
    elif entities["TIME"]:
        # No range found: use the first TIME entity as the start time.
        start_time = _parse_datetime(entities["TIME"][0])

    # Calculate the duration when both endpoints parsed. The originals
    # subtracted and strftime()'d *strings*, crashing on any time range;
    # here both values are datetimes (or None).
    duration = "Unknown"
    if start_time and end_time:
        if end_time < start_time:
            # Range like "11PM to 1AM" crosses midnight; roll end forward.
            end_time += timedelta(days=1)
        duration = calculate_duration(start_time, end_time)
        start_time_str = start_time.strftime("%H:%M")
        end_time_str = end_time.strftime("%H:%M")
    else:
        # (Also fixes the `.stftime` typo the original hit on this path.)
        start_time_str = start_time.strftime("%H:%M") if start_time else "None"
        end_time_str = "None"

    # Construct structured output.
    result = {
        "Event": entities["EVENT"][0] if entities["EVENT"] else "Unknown",
        "People": ",".join(entities["PERSON"]) if entities["PERSON"] else "None",
        "Date": ", ".join(entities["DATE"]) if entities["DATE"] else "None",
        "Start Time": start_time_str,
        "End Time": end_time_str,
        "Duration": duration,
        "Location": ", ".join(entities["GPE"]) if entities["GPE"] else "None",
    }

    # Format the output.
    return (
        f"Event: {result['Event']}\n"
        f"People: {result['People']}\n"
        f"Date: {result['Date']}\n"
        f"Start Time: {result['Start Time']}\n"
        f"End Time: {result['End Time']}\n"
        f"Duration: {result['Duration']}\n"
        f"Location: {result['Location']}\n"
    )
# Gradio interface: single text box in, formatted event summary out.
# NOTE(review): demo.launch() runs at import time — standard for a
# Hugging Face Space where app.py is executed directly.
demo = gr.Interface(
fn = process_text,
inputs = "text",
outputs = "text",
title = "Text Normalization ",
description = "Input text or OCR_extracted text to normalize and extract event with start time, end time, and duration"
)
demo.launch()