Update app.py
Browse files
app.py
CHANGED
|
@@ -23,13 +23,12 @@ nlp = load_spacy_model()
|
|
| 23 |
# Helper function: Normalize date and time expressions
|
| 24 |
def normalize_data_time(text):
    """Parse a free-form date/time expression into a ``datetime``.

    Parsing is delegated to ``dateutil.parser.parse`` with ``fuzzy=True``.

    Args:
        text: The expression to parse, such as a recognized entity string
            or a time captured by a regular expression.

    Returns:
        The parsed ``datetime``, or ``None`` when ``text`` is missing,
        blank, or cannot be parsed.
    """
    # Missing or blank input can never yield a date; answer without raising.
    if text is None or (isinstance(text, str) and not text.strip()):
        return None
    try:
        return dateutil.parser.parse(text, fuzzy=True)
    except (ValueError, TypeError, OverflowError):
        # dateutil reports unparseable text as ValueError (ParserError),
        # unsupported input types as TypeError, and out-of-range numeric
        # components as OverflowError; each one means "no usable date".
        return None
|
| 31 |
|
| 32 |
-
# Calculate duration between two times
|
| 33 |
def calculate_duration(start_time, end_time):
|
| 34 |
if isinstance(start_time, datetime) and isinstance(end_time, datetime):
|
| 35 |
delta = end_time - start_time
|
|
@@ -39,68 +38,58 @@ def calculate_duration(start_time, end_time):
|
|
| 39 |
return f"{hours}h {minutes}m"
|
| 40 |
return "Unknown"
|
| 41 |
|
| 42 |
-
# Main function
|
| 43 |
def process_text(text):
|
| 44 |
doc = nlp(text)
|
| 45 |
|
| 46 |
-
# Extract named entities
|
| 47 |
entities = {
|
| 48 |
-
"PERSON": [],
|
| 49 |
-
"DATE": [],
|
| 50 |
-
"TIME": [],
|
| 51 |
-
"GPE": [],
|
| 52 |
-
"EVENT": []
|
| 53 |
}
|
| 54 |
|
| 55 |
-
# Extract information from spaCy's named entity recognition
|
| 56 |
for ent in doc.ents:
|
| 57 |
if ent.label_ in entities:
|
| 58 |
entities[ent.label_].append(ent.text)
|
| 59 |
-
# Normalize dates
|
| 60 |
if ent.label_ == "DATE":
|
| 61 |
normalized = normalize_data_time(ent.text)
|
| 62 |
entities[ent.label_][-1] = normalized.strftime("%Y-%m-%d") if normalized else ent.text
|
| 63 |
|
| 64 |
-
# Infer the event (look for verbs or noun phrases)
|
| 65 |
event = None
|
| 66 |
for token in doc:
|
| 67 |
if token.pos_ in ["VERB", "NOUN"] and token.dep_ in ["ROOT", "dobj", "nsubj"]:
|
| 68 |
-
event = token.lemma_
|
| 69 |
break
|
| 70 |
if event:
|
| 71 |
entities["EVENT"].append(event)
|
| 72 |
|
| 73 |
-
# Extract start time, end time, and calculate duration
|
| 74 |
start_time = None
|
| 75 |
end_time = None
|
| 76 |
duration = "Unknown"
|
| 77 |
|
| 78 |
-
#
|
| 79 |
-
time_pattern = re.compile(r"(\d{1,2}(?::\d{2})?\s*(?:AM|PM|am|pm)?)\s*(?:to|until
|
| 80 |
match = time_pattern.search(text)
|
| 81 |
|
| 82 |
if match:
|
| 83 |
-
# Found a range like "3PM to 4PM"
|
| 84 |
start_str, end_str = match.groups()
|
| 85 |
start_time = normalize_data_time(start_str)
|
| 86 |
end_time = normalize_data_time(end_str)
|
| 87 |
else:
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
|
|
|
|
|
|
| 95 |
if start_time and end_time:
|
| 96 |
duration = calculate_duration(start_time, end_time)
|
| 97 |
-
start_time_str = start_time.strftime("%H:%M")
|
| 98 |
-
end_time_str = end_time.strftime("%H:%M")
|
| 99 |
-
else:
|
| 100 |
-
start_time_str = start_time.strftime("%H:%M") if start_time else "None"
|
| 101 |
-
end_time_str = "None"
|
| 102 |
|
| 103 |
-
# Construct structured output
|
| 104 |
result = {
|
| 105 |
"Event": entities["EVENT"][0] if entities["EVENT"] else "Unknown",
|
| 106 |
"People": ", ".join(entities["PERSON"]) if entities["PERSON"] else "None",
|
|
@@ -111,7 +100,6 @@ def process_text(text):
|
|
| 111 |
"Location": ", ".join(entities["GPE"]) if entities["GPE"] else "None"
|
| 112 |
}
|
| 113 |
|
| 114 |
-
# Format the output
|
| 115 |
output = (
|
| 116 |
f"Event: {result['Event']}\n"
|
| 117 |
f"People: {result['People']}\n"
|
|
@@ -123,7 +111,7 @@ def process_text(text):
|
|
| 123 |
)
|
| 124 |
return output
|
| 125 |
|
| 126 |
-
# Gradio interface
|
| 127 |
demo = gr.Interface(
|
| 128 |
fn=process_text,
|
| 129 |
inputs="text",
|
|
@@ -132,8 +120,6 @@ demo = gr.Interface(
|
|
| 132 |
description="Input text or OCR-extracted text to normalize and extract event with start time, end time, and duration"
|
| 133 |
)
|
| 134 |
|
| 135 |
-
# Launch the app
|
| 136 |
if __name__ == "__main__":
    # Start the web UI only when this file is executed as a script.
    launch_options = {"server_name": "0.0.0.0", "server_port": 7860}
    print("Launching Gradio application...")
    demo.launch(**launch_options)
|
| 139 |
-
|
|
|
|
| 23 |
# Helper function: Normalize date and time expressions
|
| 24 |
def normalize_data_time(text):
    """Parse a free-form date/time expression into a ``datetime``.

    Parsing is delegated to ``dateutil.parser.parse`` with ``fuzzy=True``.

    Args:
        text: The expression to parse, such as a recognized entity string
            or a time captured by a regular expression.

    Returns:
        The parsed ``datetime``, or ``None`` when ``text`` is missing,
        blank, or cannot be parsed.
    """
    # Missing or blank input can never yield a date; answer without raising.
    if text is None or (isinstance(text, str) and not text.strip()):
        return None
    try:
        return dateutil.parser.parse(text, fuzzy=True)
    except (ValueError, TypeError, OverflowError):
        # dateutil reports unparseable text as ValueError (ParserError),
        # unsupported input types as TypeError, and out-of-range numeric
        # components as OverflowError; each one means "no usable date".
        return None
|
| 30 |
|
| 31 |
+
# Calculate duration between two times
|
| 32 |
def calculate_duration(start_time, end_time):
|
| 33 |
if isinstance(start_time, datetime) and isinstance(end_time, datetime):
|
| 34 |
delta = end_time - start_time
|
|
|
|
| 38 |
return f"{hours}h {minutes}m"
|
| 39 |
return "Unknown"
|
| 40 |
|
| 41 |
+
# Main processing function
|
| 42 |
def process_text(text):
|
| 43 |
doc = nlp(text)
|
| 44 |
|
|
|
|
| 45 |
entities = {
|
| 46 |
+
"PERSON": [],
|
| 47 |
+
"DATE": [],
|
| 48 |
+
"TIME": [],
|
| 49 |
+
"GPE": [],
|
| 50 |
+
"EVENT": []
|
| 51 |
}
|
| 52 |
|
|
|
|
| 53 |
for ent in doc.ents:
|
| 54 |
if ent.label_ in entities:
|
| 55 |
entities[ent.label_].append(ent.text)
|
|
|
|
| 56 |
if ent.label_ == "DATE":
|
| 57 |
normalized = normalize_data_time(ent.text)
|
| 58 |
entities[ent.label_][-1] = normalized.strftime("%Y-%m-%d") if normalized else ent.text
|
| 59 |
|
|
|
|
| 60 |
event = None
|
| 61 |
for token in doc:
|
| 62 |
if token.pos_ in ["VERB", "NOUN"] and token.dep_ in ["ROOT", "dobj", "nsubj"]:
|
| 63 |
+
event = token.lemma_
|
| 64 |
break
|
| 65 |
if event:
|
| 66 |
entities["EVENT"].append(event)
|
| 67 |
|
|
|
|
| 68 |
start_time = None
|
| 69 |
end_time = None
|
| 70 |
duration = "Unknown"
|
| 71 |
|
| 72 |
+
# Try to match range
|
| 73 |
+
time_pattern = re.compile(r"(\d{1,2}(?::\d{2})?\s*(?:AM|PM|am|pm)?)\s*(?:to|until|-|–)\s*(\d{1,2}(?::\d{2})?\s*(?:AM|PM|am|pm)?)", re.IGNORECASE)
|
| 74 |
match = time_pattern.search(text)
|
| 75 |
|
| 76 |
if match:
|
|
|
|
| 77 |
start_str, end_str = match.groups()
|
| 78 |
start_time = normalize_data_time(start_str)
|
| 79 |
end_time = normalize_data_time(end_str)
|
| 80 |
else:
|
| 81 |
+
# Fallback: extract multiple AM/PM time entries
|
| 82 |
+
ampm_matches = re.findall(r"\d{1,2}(?::\d{2})?\s*[AaPp][Mm]", text)
|
| 83 |
+
if len(ampm_matches) >= 1:
|
| 84 |
+
start_time = normalize_data_time(ampm_matches[0])
|
| 85 |
+
if len(ampm_matches) >= 2:
|
| 86 |
+
end_time = normalize_data_time(ampm_matches[1])
|
| 87 |
+
|
| 88 |
+
start_time_str = start_time.strftime("%H:%M") if start_time else "None"
|
| 89 |
+
end_time_str = end_time.strftime("%H:%M") if end_time else "None"
|
| 90 |
if start_time and end_time:
|
| 91 |
duration = calculate_duration(start_time, end_time)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
|
|
|
|
| 93 |
result = {
|
| 94 |
"Event": entities["EVENT"][0] if entities["EVENT"] else "Unknown",
|
| 95 |
"People": ", ".join(entities["PERSON"]) if entities["PERSON"] else "None",
|
|
|
|
| 100 |
"Location": ", ".join(entities["GPE"]) if entities["GPE"] else "None"
|
| 101 |
}
|
| 102 |
|
|
|
|
| 103 |
output = (
|
| 104 |
f"Event: {result['Event']}\n"
|
| 105 |
f"People: {result['People']}\n"
|
|
|
|
| 111 |
)
|
| 112 |
return output
|
| 113 |
|
| 114 |
+
# Gradio interface
|
| 115 |
demo = gr.Interface(
|
| 116 |
fn=process_text,
|
| 117 |
inputs="text",
|
|
|
|
| 120 |
description="Input text or OCR-extracted text to normalize and extract event with start time, end time, and duration"
|
| 121 |
)
|
| 122 |
|
|
|
|
| 123 |
if __name__ == "__main__":
    # Start the web UI only when this file is executed as a script.
    launch_options = {"server_name": "0.0.0.0", "server_port": 7860}
    print("Launching Gradio application...")
    demo.launch(**launch_options)
|
|
|