import os
import re
import sys

import dateutil.parser
import gradio as gr
import spacy

# Load the spaCy model, downloading it on first run if necessary.
try:
    nlp = spacy.load("en_core_web_lg")
except OSError:
    print("Model not found. Downloading en_core_web_lg...")
    os.system(f"{sys.executable} -m spacy download en_core_web_lg --no-cache")
    nlp = spacy.load("en_core_web_lg")


# Helper function: normalize a date/time expression into a datetime object.
def normalize_date_time(text):
    try:
        # Use dateutil's fuzzy parsing to handle free-form expressions.
        return dateutil.parser.parse(text, fuzzy=True)
    except (ValueError, TypeError):
        return None  # Signal that parsing failed


# Calculate the duration between two times (in hours and minutes).
def calculate_duration(start_time, end_time):
    if start_time and end_time:
        delta = end_time - start_time
        total_minutes = int(delta.total_seconds() / 60)
        hours = total_minutes // 60
        minutes = total_minutes % 60
        return f"{hours}h {minutes}m"
    return "Unknown"


# Main function
def process_text(text):
    doc = nlp(text)

    # Extract named entities
    entities = {
        "PERSON": [],  # names
        "DATE": [],    # dates
        "TIME": [],    # times
        "GPE": [],     # locations
        "EVENT": [],   # events
    }

    # Collect entities from spaCy's named entity recognition.
    for ent in doc.ents:
        if ent.label_ in entities:
            entities[ent.label_].append(ent.text)
            # Normalize dates to YYYY-MM-DD where possible.
            if ent.label_ == "DATE":
                normalized = normalize_date_time(ent.text)
                if normalized:
                    entities["DATE"][-1] = normalized.strftime("%Y-%m-%d")

    # Infer the event (look for a root verb or a key noun phrase).
    event = None
    for token in doc:
        if token.pos_ in ("VERB", "NOUN") and token.dep_ in ("ROOT", "dobj", "nsubj"):
            event = token.lemma_  # use the lemmatized form
            break
    if event:
        entities["EVENT"].append(event)

    # Extract start time, end time, and calculate duration.
    start_time = None
    end_time = None
    duration = "Unknown"

    # Look for ranges like "from 3PM to 4PM" or "3PM to 5PM".
    time_pattern = re.compile(
        r"(\d{1,2}(?::\d{2})?\s*(?:AM|PM)?)\s*(?:to|until|-)\s*"
        r"(\d{1,2}(?::\d{2})?\s*(?:AM|PM)?)",
        re.IGNORECASE,
    )
    match = time_pattern.search(text)
    if match:
        # Found a range like "3PM to 4PM"
        start_str, end_str = match.groups()
        start_time = normalize_date_time(start_str)
        end_time = normalize_date_time(end_str)
    elif entities["TIME"]:
        # If no range was found, use the first TIME entity as the start time.
        start_time = normalize_date_time(entities["TIME"][0])

    # Calculate the duration if both start and end times are available.
    if start_time and end_time:
        duration = calculate_duration(start_time, end_time)
        start_time_str = start_time.strftime("%H:%M")
        end_time_str = end_time.strftime("%H:%M")
    else:
        start_time_str = start_time.strftime("%H:%M") if start_time else "None"
        end_time_str = "None"

    # Construct the structured output.
    result = {
        "Event": entities["EVENT"][0] if entities["EVENT"] else "Unknown",
        "People": ", ".join(entities["PERSON"]) if entities["PERSON"] else "None",
        "Date": ", ".join(entities["DATE"]) if entities["DATE"] else "None",
        "Start Time": start_time_str,
        "End Time": end_time_str,
        "Duration": duration,
        "Location": ", ".join(entities["GPE"]) if entities["GPE"] else "None",
    }

    # Format the output
    output = (
        f"Event: {result['Event']}\n"
        f"People: {result['People']}\n"
        f"Date: {result['Date']}\n"
        f"Start Time: {result['Start Time']}\n"
        f"End Time: {result['End Time']}\n"
        f"Duration: {result['Duration']}\n"
        f"Location: {result['Location']}\n"
    )
    return output


# Gradio interface
demo = gr.Interface(
    fn=process_text,
    inputs="text",
    outputs="text",
    title="Text Normalization",
    description=(
        "Input raw or OCR-extracted text to normalize it and extract the event "
        "with its start time, end time, and duration."
    ),
)

demo.launch()