File size: 4,112 Bytes
a842478 cef0cb9 1ce3e50 2dea340 0259b0b 1ce3e50 0259b0b b9a25ba 356ac0f 0259b0b 356ac0f 0259b0b 356ac0f 0259b0b 2dea340 36e8407 2dea340 36e8407 2dea340 36e8407 2dea340 0259b0b 2dea340 36e8407 cef0cb9 2dea340 36e8407 2dea340 0259b0b 2dea340 0259b0b 36e8407 0259b0b 2dea340 0259b0b 2dea340 36e8407 2dea340 36e8407 2dea340 0259b0b 2dea340 0259b0b 2dea340 0259b0b 2dea340 0259b0b 2dea340 36e8407 0259b0b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 |
import gradio as gr
import spacy
import dateutil.parser
import re
import os
import sys
from datetime import datetime, timedelta
# Load the spaCy model
def load_spacy_model():
    """Load the ``en_core_web_sm`` spaCy pipeline, downloading it if missing.

    The first ``spacy.load`` attempt raises ``OSError`` when the model package
    is not installed. In that case the model is downloaded with the current
    interpreter (``python -m spacy download``) and loaded again.

    Returns:
        The loaded spaCy pipeline object.

    Raises:
        OSError: If the download command exits with a non-zero status, or the
            model still cannot be loaded after downloading.
    """
    # Local import so this function does not depend on the module's import list.
    import subprocess

    model_name = "en_core_web_sm"
    try:
        nlp = spacy.load(model_name)
    except OSError:
        print("Model not found. Downloading en_core_web_sm...")
        # An argument list (no shell) keeps working when the interpreter path
        # contains spaces or shell metacharacters.
        completed = subprocess.run(
            [sys.executable, "-m", "spacy", "download", model_name, "--no-cache"],
            check=False,
        )
        if completed.returncode != 0:
            # Fail with a message naming the real cause instead of letting the
            # reload below report a generic "model not found".
            raise OSError(
                f"Downloading {model_name} failed with exit code "
                f"{completed.returncode}"
            )
        nlp = spacy.load(model_name)
        print("Model downloaded and loaded successfully.")
    else:
        print("Model loaded successfully.")
    return nlp
# Shared spaCy pipeline, created once at import time and used by process_text.
nlp = load_spacy_model()
# Helper function: Normalize date and time expressions
def normalize_data_time(text):
    """Parse a free-form date/time expression into a ``datetime``.

    Parsing is delegated to ``dateutil.parser.parse`` in fuzzy mode, so tokens
    that are not part of the date or time are skipped.

    Args:
        text: The expression to parse, e.g. ``"March 3rd at 5 PM"``.

    Returns:
        The parsed ``datetime``, or ``None`` when ``text`` cannot be parsed.
    """
    try:
        return dateutil.parser.parse(text, fuzzy=True)
    except (ValueError, OverflowError, TypeError):
        # ValueError: no recognisable date (dateutil's ParserError derives
        # from it). OverflowError: dateutil documents this for numbers too
        # large for the platform's C integer. TypeError: unsupported input.
        return None
# Calculate duration between two times
def calculate_duration(start_time, end_time):
    """Format the time between two datetimes as ``"<hours>h <minutes>m"``.

    Seconds are dropped (truncated towards zero). When ``end_time`` is earlier
    than ``start_time`` the result carries a single leading minus sign, e.g.
    ``"-1h 30m"`` for ninety minutes backwards.

    Args:
        start_time: Start of the interval.
        end_time: End of the interval.

    Returns:
        The formatted duration, or ``"Unknown"`` if either argument is not a
        ``datetime`` instance.
    """
    if not (isinstance(start_time, datetime) and isinstance(end_time, datetime)):
        return "Unknown"

    total_minutes = int((end_time - start_time).total_seconds() / 60)
    # Split the magnitude, then re-attach the sign: floor division and modulo
    # on a negative count would turn -90 minutes into "-2h 30m".
    sign = "-" if total_minutes < 0 else ""
    hours, minutes = divmod(abs(total_minutes), 60)
    return f"{sign}{hours}h {minutes}m"
# Main processing function
# A clock time such as "9", "9:30" or "5 pm"; groups: hour, minute, meridiem.
_CLOCK_RE = re.compile(r"(\d{1,2})(?::(\d{2}))?\s*([ap]m)?", re.IGNORECASE)

# Two clock times joined by "to", "until", "-" or an en dash. The look-arounds
# keep the pattern from firing inside longer digit runs such as "2024-01-15",
# and the word boundary stops "5 amazing" from being read as "5 am".
_TIME_RANGE_RE = re.compile(
    r"(?<![\d:/-])(\d{1,2}(?::\d{2})?(?:\s*[ap]m\b)?)"
    r"\s*(?:to|until|-|–)\s*"
    r"(\d{1,2}(?::\d{2})?(?:\s*[ap]m\b)?)(?![\d:/-])",
    re.IGNORECASE,
)

# A stand-alone clock time that carries an explicit AM/PM marker.
_MERIDIEM_TIME_RE = re.compile(
    r"(?<![\d:])\d{1,2}(?::\d{2})?\s*[ap]m\b", re.IGNORECASE
)

_MINUTES_PER_DAY = 24 * 60


def _split_clock(clock_text):
    """Return ``(hour, minute, meridiem)`` for a clock string, else ``None``."""
    match = _CLOCK_RE.fullmatch(clock_text.strip())
    if match is None:
        return None
    hour = int(match.group(1))
    minute = int(match.group(2) or 0)
    meridiem = match.group(3).lower() if match.group(3) else None
    if hour > 23 or minute > 59:
        return None
    if meridiem is not None and not 1 <= hour <= 12:
        # "14 pm" / "0 am": the hour is already an unambiguous 24-hour value.
        meridiem = None
    return hour, minute, meridiem


def _clock_options(clock, guess_meridiem):
    """List the minutes-after-midnight readings a parsed clock can take."""
    hour, minute, meridiem = clock
    if meridiem is not None:
        return [(hour % 12 + (12 if meridiem == "pm" else 0)) * 60 + minute]
    if guess_meridiem and 1 <= hour <= 12:
        # No marker of its own: offer both the AM and the PM reading.
        return [(hour % 12) * 60 + minute, (hour % 12 + 12) * 60 + minute]
    return [hour * 60 + minute]


def _resolve_time_range(start_text, end_text, day_start):
    """Turn two clock strings into ``(start, end)`` datetimes on ``day_start``.

    A side that cannot be parsed is returned as ``None``. When only one side
    has an AM/PM marker, the other side takes whichever reading gives the
    shortest non-zero forward duration ("9 to 5 PM" -> 09:00-17:00, "10 to
    2 AM" -> 22:00-02:00). An end that falls before the start is moved to the
    following day.
    """
    start_clock = _split_clock(start_text)
    end_clock = _split_clock(end_text)
    if start_clock is None or end_clock is None:
        return tuple(
            None if clock is None
            else day_start + timedelta(minutes=_clock_options(clock, False)[0])
            for clock in (start_clock, end_clock)
        )

    start_options = _clock_options(start_clock, end_clock[2] is not None)
    end_options = _clock_options(end_clock, start_clock[2] is not None)

    def forward_minutes(pair):
        # A zero-length range ranks last so "6 to 6 PM" reads as twelve hours.
        return (pair[1] - pair[0]) % _MINUTES_PER_DAY or _MINUTES_PER_DAY

    start_minutes, end_minutes = min(
        ((s, e) for s in start_options for e in end_options),
        key=forward_minutes,
    )
    if end_minutes < start_minutes:
        end_minutes += _MINUTES_PER_DAY  # the range crosses midnight
    return (
        day_start + timedelta(minutes=start_minutes),
        day_start + timedelta(minutes=end_minutes),
    )


def process_text(text):
    """Extract an event summary (who, when, where, how long) from free text.

    spaCy supplies the PERSON, DATE, TIME, GPE and EVENT entities. DATE
    entities are normalised to ``YYYY-MM-DD`` when ``normalize_data_time`` can
    parse them and kept verbatim otherwise. If spaCy reports no EVENT entity,
    the lemma of the first verb or noun acting as root, direct object or
    subject is used as the event name.

    Start and end times come from the first "<time> to/until/- <time>" range
    in the text. Without such a range, the first two times carrying an AM/PM
    marker are used. Times without a marker are read on the 24-hour clock
    unless the other side of the range has one. Both times are placed on
    today's date; an end earlier than the start is taken to be on the next day.

    Args:
        text: The input text; ``None`` is treated as an empty string.

    Returns:
        A newline-terminated, multi-line string with the fields Event, People,
        Date, Start Time, End Time, Duration and Location.
    """
    text = text or ""
    doc = nlp(text)

    entities = {label: [] for label in ("PERSON", "DATE", "TIME", "GPE", "EVENT")}
    for ent in doc.ents:
        if ent.label_ not in entities:
            continue
        value = ent.text
        if ent.label_ == "DATE":
            normalized = normalize_data_time(ent.text)
            if normalized:
                value = normalized.strftime("%Y-%m-%d")
        entities[ent.label_].append(value)

    # Fallback event name; it is appended after any spaCy EVENT entities, so
    # those take precedence in the output.
    event = next(
        (
            token.lemma_
            for token in doc
            if token.pos_ in ("VERB", "NOUN")
            and token.dep_ in ("ROOT", "dobj", "nsubj")
        ),
        None,
    )
    if event:
        entities["EVENT"].append(event)

    # One shared midnight anchor keeps start and end on the same calendar day.
    day_start = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)
    start_time = None
    end_time = None
    range_match = _TIME_RANGE_RE.search(text)
    if range_match:
        start_time, end_time = _resolve_time_range(
            range_match.group(1), range_match.group(2), day_start
        )
    else:
        stamped = _MERIDIEM_TIME_RE.findall(text)
        if len(stamped) >= 2:
            start_time, end_time = _resolve_time_range(
                stamped[0], stamped[1], day_start
            )
        elif stamped:
            start_time, _ = _resolve_time_range(stamped[0], "", day_start)

    duration = "Unknown"
    if start_time and end_time:
        duration = calculate_duration(start_time, end_time)

    result = {
        "Event": entities["EVENT"][0] if entities["EVENT"] else "Unknown",
        "People": ", ".join(entities["PERSON"]) if entities["PERSON"] else "None",
        "Date": ", ".join(entities["DATE"]) if entities["DATE"] else "None",
        "Start Time": start_time.strftime("%H:%M") if start_time else "None",
        "End Time": end_time.strftime("%H:%M") if end_time else "None",
        "Duration": duration,
        "Location": ", ".join(entities["GPE"]) if entities["GPE"] else "None",
    }
    return (
        f"Event: {result['Event']}\n"
        f"People: {result['People']}\n"
        f"Date: {result['Date']}\n"
        f"Start Time: {result['Start Time']}\n"
        f"End Time: {result['End Time']}\n"
        f"Duration: {result['Duration']}\n"
        f"Location: {result['Location']}\n"
    )
# Gradio interface
# One text box in, one plain-text summary out; process_text does the work.
demo = gr.Interface(
    title="Text Normalization",
    description="Input text or OCR-extracted text to normalize and extract event with start time, end time, and duration",
    fn=process_text,
    inputs="text",
    outputs="text",
)

# Start the web server only when run as a script, not when imported.
if __name__ == "__main__":
    print("Launching Gradio application...")
    # Listen on every network interface, port 7860.
    demo.launch(server_port=7860, server_name="0.0.0.0")
|