"""Gradio app that extracts event details (people, date, time range, place) from text."""

import os
import re
import subprocess
import sys
from datetime import datetime, timedelta
from typing import Optional

import dateutil.parser
import gradio as gr
import spacy

# A time range such as "2 to 4 PM", "9:30-11", "10pm until 2am".
# The look-arounds keep the pattern from firing inside dates like
# "2024-05-10" or "10-12-2024", and ``\b`` stops "2 amazing" being read
# as "2 am".
_TIME_RANGE_RE = re.compile(
    r"(?<![\d:/.-])"
    r"(\d{1,2}(?::\d{2})?(?:\s*(?:am|pm)\b)?)"
    r"\s*(?:to|until|-|–)\s*"
    r"(\d{1,2}(?::\d{2})?(?:\s*(?:am|pm)\b)?)"
    r"(?![\d/-]|:\d)",
    re.IGNORECASE,
)

# A standalone clock time that carries an explicit AM/PM marker.
_AMPM_TIME_RE = re.compile(r"(?<![\d:])\d{1,2}(?::\d{2})?\s*[ap]m\b", re.IGNORECASE)

# A single clock time, split into hour / optional minutes / optional meridiem.
_CLOCK_RE = re.compile(r"\s*(\d{1,2})(?::(\d{2}))?\s*([ap]m)?\s*$", re.IGNORECASE)

_MERIDIEM_RE = re.compile(r"[ap]m", re.IGNORECASE)


def load_spacy_model():
    """Load the ``en_core_web_sm`` spaCy pipeline, downloading it if missing.

    Returns:
        The loaded spaCy ``Language`` object.

    Raises:
        OSError: If the model is still unavailable after the download attempt.
    """
    try:
        nlp = spacy.load("en_core_web_sm")
        print("Model loaded successfully.")
    except OSError:
        print("Model not found. Downloading en_core_web_sm...")
        # An argument list (no shell) keeps interpreter paths containing
        # spaces or shell metacharacters intact.
        completed = subprocess.run(
            [sys.executable, "-m", "spacy", "download", "en_core_web_sm", "--no-cache"],
            check=False,
        )
        if completed.returncode != 0:
            print(f"Model download exited with status {completed.returncode}.")
        # If the download failed, this raises the same OSError as before.
        nlp = spacy.load("en_core_web_sm")
        print("Model downloaded and loaded successfully.")
    return nlp


nlp = load_spacy_model()


def normalize_data_time(text):
    """Parse a free-form date/time expression with dateutil (fuzzy mode).

    Args:
        text: The expression to parse, e.g. ``"March 3rd, 2024"``.

    Returns:
        A ``datetime`` on success, or ``None`` when *text* cannot be parsed.
    """
    try:
        return dateutil.parser.parse(text, fuzzy=True)
    except (ValueError, TypeError, OverflowError):
        # OverflowError is raised for numbers too large to be a date field.
        return None


def calculate_duration(start_time, end_time):
    """Format the span between two datetimes as ``"<hours>h <minutes>m"``.

    Args:
        start_time: Beginning of the span.
        end_time: End of the span.

    Returns:
        The formatted duration (prefixed with ``-`` when *end_time* precedes
        *start_time*), or ``"Unknown"`` when either argument is not a
        ``datetime`` or the two cannot be subtracted.
    """
    if not (isinstance(start_time, datetime) and isinstance(end_time, datetime)):
        return "Unknown"
    try:
        delta = end_time - start_time
    except TypeError:
        # Naive and timezone-aware datetimes cannot be subtracted.
        return "Unknown"
    total_minutes = int(delta.total_seconds() / 60)
    # Split the magnitude so a negative span reads "-7h 30m", not "-8h 30m".
    sign = "-" if total_minutes < 0 else ""
    hours, minutes = divmod(abs(total_minutes), 60)
    return f"{sign}{hours}h {minutes}m"


def _meridiem_of(text: str) -> Optional[str]:
    """Return ``"am"``/``"pm"`` if *text* contains such a marker, else ``None``."""
    found = _MERIDIEM_RE.search(text)
    return found.group(0).lower() if found else None


def _parse_clock(text: str, base: datetime, meridiem: Optional[str] = None) -> Optional[datetime]:
    """Parse a clock time such as ``"2"``, ``"2:30"`` or ``"2 PM"`` onto *base*'s date.

    dateutil reads a bare number such as ``"2"`` as a day of the month, so
    clock times are parsed here directly. *meridiem* is applied only when
    *text* carries no AM/PM marker of its own.
    """
    match = _CLOCK_RE.match(text)
    if match is None:
        return None
    hour = int(match.group(1))
    minute = int(match.group(2) or 0)
    suffix = (match.group(3) or meridiem or "").lower()
    # A marker on an hour above 12 (e.g. "14 PM") is ignored.
    if suffix and hour <= 12:
        if suffix == "pm" and hour < 12:
            hour += 12
        elif suffix == "am" and hour == 12:
            hour = 0
    if hour > 23 or minute > 59:
        return None
    return base.replace(hour=hour, minute=minute)


def _extract_times(text: str):
    """Return ``(start, end)`` datetimes found in *text*; either may be ``None``."""
    base = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)
    start_time = None
    end_time = None

    match = _TIME_RANGE_RE.search(text)
    if match:
        start_str, end_str = match.groups()
        end_time = _parse_clock(end_str, base)
        end_meridiem = _meridiem_of(end_str)
        if end_meridiem and not _meridiem_of(start_str):
            # "2 to 4 PM": the start borrows the end's marker ...
            start_time = _parse_clock(start_str, base, meridiem=end_meridiem)
            if start_time and end_time and start_time > end_time:
                # ... unless that puts it after the end ("11 to 1 PM").
                opposite = "am" if end_meridiem == "pm" else "pm"
                start_time = _parse_clock(start_str, base, meridiem=opposite)
        else:
            start_time = _parse_clock(start_str, base)
    else:
        # Fallback: take the first two standalone AM/PM times.
        stamps = _AMPM_TIME_RE.findall(text)
        if stamps:
            start_time = _parse_clock(stamps[0], base)
        if len(stamps) >= 2:
            end_time = _parse_clock(stamps[1], base)

    # An end earlier than the start is treated as crossing midnight.
    if start_time and end_time and end_time < start_time:
        end_time += timedelta(days=1)
    return start_time, end_time


def process_text(text):
    """Extract event, people, date, time range, duration and location from *text*.

    Args:
        text: Free-form input (typed or OCR-extracted). ``None`` is treated
            as an empty string.

    Returns:
        A multi-line, human-readable summary with one ``Label: value`` line
        per field.
    """
    text = text or ""
    doc = nlp(text)

    entities = {"PERSON": [], "DATE": [], "TIME": [], "GPE": [], "EVENT": []}
    for ent in doc.ents:
        if ent.label_ not in entities:
            continue
        value = ent.text
        if ent.label_ == "DATE":
            # Dates are shown in ISO form when parseable, verbatim otherwise.
            normalized = normalize_data_time(ent.text)
            if normalized:
                value = normalized.strftime("%Y-%m-%d")
        entities[ent.label_].append(value)

    # The first verb/noun acting as root, direct object or subject is used
    # as the event name; an EVENT entity found above takes precedence.
    event = next(
        (
            token.lemma_
            for token in doc
            if token.pos_ in ("VERB", "NOUN") and token.dep_ in ("ROOT", "dobj", "nsubj")
        ),
        None,
    )
    if event:
        entities["EVENT"].append(event)

    start_time, end_time = _extract_times(text)
    start_time_str = start_time.strftime("%H:%M") if start_time else "None"
    end_time_str = end_time.strftime("%H:%M") if end_time else "None"
    duration = "Unknown"
    if start_time and end_time:
        duration = calculate_duration(start_time, end_time)

    result = {
        "Event": entities["EVENT"][0] if entities["EVENT"] else "Unknown",
        "People": ", ".join(entities["PERSON"]) if entities["PERSON"] else "None",
        "Date": ", ".join(entities["DATE"]) if entities["DATE"] else "None",
        "Start Time": start_time_str,
        "End Time": end_time_str,
        "Duration": duration,
        "Location": ", ".join(entities["GPE"]) if entities["GPE"] else "None",
    }
    return "".join(f"{label}: {value}\n" for label, value in result.items())


# Gradio interface
demo = gr.Interface(
    fn=process_text,
    inputs="text",
    outputs="text",
    title="Text Normalization",
    description="Input text or OCR-extracted text to normalize and extract event with start time, end time, and duration",
)

if __name__ == "__main__":
    print("Launching Gradio application...")
    demo.launch(server_name="0.0.0.0", server_port=7860)