Spaces:

Zoe911
/

spaCy-C

Sleeping

File size: 4,484 Bytes

import gradio as gr
import spacy
import dateutil.parser
import re
import os 
import sys 
from datetime import datetime, timedelta

# Load the spaCy model, downloading it on first run if it is not installed.
try:
    nlp = spacy.load("en_core_web_lg")
except OSError:
    # Imported here because it is only needed on the download path.
    import subprocess

    print("Model not found. Downloading en_core_web_lg...")
    # Pass an argument list (no shell) so an interpreter path containing
    # spaces still works; check=True surfaces a failed download right away
    # instead of falling through to a second, less informative load error.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_lg", "--no-cache"],
        check=True,
    )
    nlp = spacy.load("en_core_web_lg")
    
# Helper function: normalize date and time expressions
def normalize_data_time(text):
    """Normalize a free-form date/time expression to ``YYYY-MM-DD HH:MM``.

    Parsing is delegated to ``dateutil.parser.parse`` in fuzzy mode, so
    surrounding words that are not part of the date are ignored.

    Args:
        text: The expression to normalize (e.g. "March 3rd 2024 at 5pm").

    Returns:
        The normalized timestamp string, or *text* unchanged when it cannot
        be interpreted as a date/time.
    """
    try:
        # use dateutil to parse the time
        parsed_date = dateutil.parser.parse(text, fuzzy=True)
        return parsed_date.strftime("%Y-%m-%d %H:%M")
    except (ValueError, TypeError, OverflowError):
        # OverflowError is raised by dateutil for absurdly large numbers;
        # treat it like any other unparseable input.
        return text  # Return original text if parsing fails

 # Calculate duration between two times (in hours and minutes)
def calculate_duration(start_time, end_time):
    """Return the time elapsed between two datetimes as ``"<H>h <M>m"``.

    Args:
        start_time: ``datetime`` marking the start, or a falsy value if unknown.
        end_time: ``datetime`` marking the end, or a falsy value if unknown.

    Returns:
        ``"Unknown"`` when either argument is missing. Otherwise the
        difference in whole minutes, split into hours and minutes; a leading
        ``-`` marks an end that precedes the start (e.g. ``"-1h 30m"``).
    """
    if not start_time or not end_time:
        return "Unknown"

    total_minutes = int((end_time - start_time).total_seconds() / 60)
    # Split the magnitude and re-attach the sign: floor division on a negative
    # number would turn -90 minutes into the misleading "-2h 30m".
    sign = "-" if total_minutes < 0 else ""
    hours, minutes = divmod(abs(total_minutes), 60)
    return f"{sign}{hours}h {minutes}m"


    
 #main function       
def process_text(text):
    """Extract a structured event summary from free-form text.

    Runs the module-level spaCy pipeline ``nlp`` over *text*, collects
    PERSON / DATE / TIME / GPE / EVENT entities, looks for a clock-time range
    such as "3PM to 5PM", and formats the findings as a multi-line report.

    Args:
        text: Input sentence(s); ``None`` is treated as an empty string.

    Returns:
        A string with one ``Label: value`` line each for Event, People, Date,
        Start Time, End Time, Duration and Location. Missing values are
        reported as "Unknown" (event, duration) or "None" (all other fields).
    """
    text = text or ""
    doc = nlp(text)

    # extract named entities, keyed by spaCy label
    entities = {
        "PERSON": [],  # names
        "DATE": [],  # dates
        "TIME": [],  # times
        "GPE": [],  # locations
        "EVENT": [],  # events
    }
    for ent in doc.ents:
        if ent.label_ not in entities:
            continue
        value = ent.text
        if ent.label_ == "DATE":
            # Normalize dates; keep the original wording when unparseable.
            parsed = _to_datetime(ent.text)
            if parsed is not None:
                value = parsed.strftime("%Y-%m-%d")
        entities[ent.label_].append(value)

    # Infer the event once per document (not once per entity), and only as a
    # fallback when spaCy reported no EVENT entity: take the lemma of the
    # first verb/noun acting as root, direct object or subject.
    if not entities["EVENT"]:
        inferred = next(
            (
                token.lemma_
                for token in doc
                if token.pos_ in ("VERB", "NOUN")
                and token.dep_ in ("ROOT", "dobj", "nsubj")
            ),
            None,
        )
        if inferred:
            entities["EVENT"].append(inferred)

    # extract start time, end time, and calculate duration
    start_time = None
    end_time = None
    duration = "Unknown"

    match = _TIME_RANGE_RE.search(text)
    if match:
        # found a range like "3PM to 4PM"
        start_time, end_time = (
            _to_datetime(part.strip()) for part in match.groups()
        )
    elif entities["TIME"]:
        # if no range, use the first TIME entity as start time
        start_time = _to_datetime(entities["TIME"][0])

    if start_time and end_time:
        # Ensure start and end are on the same day; an end that is earlier
        # than the start (e.g. "11PM to 1AM") is taken to be the next day.
        end_time = datetime.combine(start_time.date(), end_time.time())
        if end_time < start_time:
            end_time += timedelta(days=1)
        duration = calculate_duration(start_time, end_time)

    start_time_str = start_time.strftime("%H:%M") if start_time else "None"
    end_time_str = end_time.strftime("%H:%M") if end_time else "None"

    # construct structured output
    result = {
        "Event": entities["EVENT"][0] if entities["EVENT"] else "Unknown",
        "People": ", ".join(entities["PERSON"]) if entities["PERSON"] else "None",
        "Date": ", ".join(entities["DATE"]) if entities["DATE"] else "None",
        "Start Time": start_time_str,
        "End Time": end_time_str,
        "Duration": duration,
        "Location": ", ".join(entities["GPE"]) if entities["GPE"] else "None",
    }

    # format the output: one "Label: value" line per field
    return "".join(f"{label}: {value}\n" for label, value in result.items())


# --- private helpers used by process_text ---

# Format emitted by normalize_data_time() when parsing succeeds.
_NORMALIZED_FORMAT = "%Y-%m-%d %H:%M"

# Clock-time range such as "3PM to 5PM", "3 pm until 4:30 pm" or "9:00-10:15".
# NOTE(review): a bare hour with neither minutes nor AM/PM (e.g. "3 to 5") is
# presumably read by dateutil as a day of the month rather than an hour, and
# date-like text such as "01-15" also matches this pattern -- confirm whether
# such input needs special handling.
_TIME_RANGE_RE = re.compile(
    r"\b(\d{1,2}(?::\d{2})?\s*(?:AM|PM)?)\s*(?:to|until|-)\s*"
    r"(\d{1,2}(?::\d{2})?\s*(?:AM|PM)?)",
    re.IGNORECASE,
)


def _to_datetime(text):
    """Parse *text* via normalize_data_time(); return a datetime or None."""
    normalized = normalize_data_time(text)
    try:
        # normalize_data_time returns a formatted string on success and the
        # untouched input on failure; only the former matches this format.
        return datetime.strptime(normalized, _NORMALIZED_FORMAT)
    except (TypeError, ValueError):
        return None


if __name__ == "__main__":
    # gradio interface
    demo = gr.Interface(
        fn=process_text,
        inputs="text",
        outputs="text",
        title="Text Normalization ",
        description="Input text or OCR_extracted text to normalize and extract event with start time, end time, and duration",
    )

    demo.launch()