import os
import re
import subprocess
import sys
from datetime import datetime, timedelta

import dateutil.parser
import gradio as gr
import spacy
# Load the spaCy model, downloading it on first use if it is missing.
try:
    nlp = spacy.load("en_core_web_lg")
except OSError:
    print("Model not found. Downloading en_core_web_lg...")
    # Use an argument list (shell=False) instead of os.system with an
    # f-string: safe when sys.executable contains spaces, and check=True
    # raises instead of silently ignoring a failed download.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_lg", "--no-cache"],
        check=True,
    )
    nlp = spacy.load("en_core_web_lg")
# Helper function: normalize date and time expressions.
def normalize_data_time(text):
    """Normalize a free-form date/time expression to "YYYY-MM-DD HH:MM".

    Parameters
    ----------
    text : str
        Free-form date/time text (e.g. "3pm next Friday").

    Returns
    -------
    str
        The normalized timestamp string on success, or the original
        ``text`` unchanged when it cannot be parsed.
    """
    try:
        # fuzzy=True lets dateutil skip unrelated words surrounding the date.
        parsed_date = dateutil.parser.parse(text, fuzzy=True)
        return parsed_date.strftime("%Y-%m-%d %H:%M")
    except (ValueError, TypeError, OverflowError):
        # OverflowError added: dateutil raises it for out-of-range numeric
        # input, which the old except clause let propagate.
        return text  # Return original text if parsing fails
# Calculate duration between two times (in hours and minutes).
def calculate_duration(start_time, end_time):
    """Return the span between two datetimes formatted as "Xh Ym".

    Parameters
    ----------
    start_time, end_time : datetime.datetime or None
        Range endpoints. If either is missing, "Unknown" is returned.

    Returns
    -------
    str
        "Xh Ym", assuming a wrap past midnight (one day added) when
        end_time precedes start_time — the old code produced nonsense
        like "-2h 30m" in that case due to floor division on negatives.
    """
    if not (start_time and end_time):
        return "Unknown"
    delta = end_time - start_time
    if delta.total_seconds() < 0:
        # e.g. "11PM to 1AM": treat the end as falling on the next day.
        delta += timedelta(days=1)
    hours, minutes = divmod(int(delta.total_seconds()) // 60, 60)
    return f"{hours}h {minutes}m"
def _parse_datetime(text):
    """Parse free-form date/time text into a datetime, or None on failure."""
    try:
        return dateutil.parser.parse(text, fuzzy=True)
    except (ValueError, TypeError, OverflowError):
        return None


# Pre-compiled pattern for ranges like "3PM to 4PM" or "3:15pm - 5pm".
# Fixed vs. the old inline pattern: removed a stray `\d*` and added the
# missing `\s*` before AM/PM in the first capture group.
_TIME_RANGE_RE = re.compile(
    r"(\d{1,2}(?::\d{2})?\s*(?:AM|PM)?)\s*(?:to|until|-)\s*"
    r"(\d{1,2}(?::\d{2})?\s*(?:AM|PM)?)",
    re.IGNORECASE,
)


# Main function.
def process_text(text):
    """Extract event, people, date, time range, duration and location.

    Runs spaCy NER over ``text``, normalizes DATE entities to YYYY-MM-DD,
    looks for an explicit time range ("3PM to 5PM"), computes the duration,
    and returns a human-readable multi-line summary string.

    Parameters
    ----------
    text : str
        Raw or OCR-extracted input text.

    Returns
    -------
    str
        Formatted "Event/People/Date/Start Time/End Time/Duration/Location"
        summary, with "Unknown"/"None" placeholders for missing fields.
    """
    doc = nlp(text)

    # Buckets for the entity labels we care about.
    entities = {
        "PERSON": [],  # names
        "DATE": [],    # dates
        "TIME": [],    # times
        "GPE": [],     # locations
        "EVENT": []    # events
    }

    # Extract information from spaCy's named entity recognition.
    for ent in doc.ents:
        if ent.label_ in entities:
            if ent.label_ == "DATE":
                # BUG FIX: normalize_data_time returns a *string*, so the
                # old code crashed calling .strftime() on its result. Parse
                # to a real datetime here and format only on success.
                parsed = _parse_datetime(ent.text)
                entities["DATE"].append(
                    parsed.strftime("%Y-%m-%d") if parsed else ent.text
                )
            else:
                entities[ent.label_].append(ent.text)

    # Infer the event: first salient verb/noun, lemmatized.
    for token in doc:
        if token.pos_ in ("VERB", "NOUN") and token.dep_ in ("ROOT", "dobj", "nsubj"):
            entities["EVENT"].append(token.lemma_)
            break

    # Extract start time, end time, and calculate duration.
    start_time = None
    end_time = None
    duration = "Unknown"

    match = _TIME_RANGE_RE.search(text)
    if match:
        # Found a range like "3PM to 4PM"; keep datetime objects so the
        # arithmetic below works (the old code mixed strings and datetimes).
        start_str, end_str = match.groups()
        start_time = _parse_datetime(start_str)
        end_time = _parse_datetime(end_str)
    elif entities["TIME"]:
        # No range: fall back to the first TIME entity as the start time.
        start_time = _parse_datetime(entities["TIME"][0])

    if start_time and end_time:
        duration = calculate_duration(start_time, end_time)
        start_time_str = start_time.strftime("%H:%M")
        end_time_str = end_time.strftime("%H:%M")
    else:
        # Typo fix: the original called the nonexistent .stftime() here.
        start_time_str = start_time.strftime("%H:%M") if start_time else "None"
        end_time_str = "None"

    # Construct structured output (People now joined with ", " for
    # consistency with Date and Location).
    result = {
        "Event": entities["EVENT"][0] if entities["EVENT"] else "Unknown",
        "People": ", ".join(entities["PERSON"]) if entities["PERSON"] else "None",
        "Date": ", ".join(entities["DATE"]) if entities["DATE"] else "None",
        "Start Time": start_time_str,
        "End Time": end_time_str,
        "Duration": duration,
        "Location": ", ".join(entities["GPE"]) if entities["GPE"] else "None",
    }

    # Format the output shown to the user.
    return (
        f"Event: {result['Event']}\n"
        f"People: {result['People']}\n"
        f"Date: {result['Date']}\n"
        f"Start Time: {result['Start Time']}\n"
        f"End Time: {result['End Time']}\n"
        f"Duration: {result['Duration']}\n"
        f"Location: {result['Location']}\n"
    )
# Gradio interface: one free-text input box, one formatted-summary output.
# Launching at module import time is intentional — this file is the app
# entry point.
demo = gr.Interface(
fn = process_text,
inputs = "text",
outputs = "text",
title = "Text Normalization ",
description = "Input text or OCR_extracted text to normalize and extract event with start time, end time, and duration"
)
demo.launch()