File size: 4,484 Bytes
a842478
cef0cb9
1ce3e50
2dea340
1ce3e50
 
 
 
2dea340
84dfc28
 
 
1ce3e50
 
84dfc28
 
2dea340
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a842478
2dea340
 
cef0cb9
 
2dea340
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cef0cb9
2dea340
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a842478
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import gradio as gr
import spacy
import dateutil.parser
import re
import os 
import sys 
from datetime import datetime, timedelta

# Load the spaCy model, downloading it on first use if it is not installed.
try:
    nlp = spacy.load("en_core_web_lg")
except OSError:
    # Local import: only needed on the one-time download path.
    import subprocess

    print("Model not found. Downloading en_core_web_lg...")
    # Pass the command as an argument list (no shell) so an interpreter path
    # containing spaces is handled correctly, and fail loudly if the download
    # itself fails instead of continuing to a confusing second load error.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_lg", "--no-cache"],
        check=True,
    )
    nlp = spacy.load("en_core_web_lg")
    
# Helper function: normalize date and time expressions
def normalize_data_time(text):
    """Normalize a free-form date/time expression to ``YYYY-MM-DD HH:MM``.

    The text is parsed with ``dateutil.parser.parse`` in fuzzy mode, so
    unrecognized words around the date/time are ignored by the parser.

    Args:
        text: The date/time expression to normalize.

    Returns:
        The parsed value formatted as ``"%Y-%m-%d %H:%M"``, or ``text``
        unchanged when it cannot be parsed.
    """
    try:
        parsed_date = dateutil.parser.parse(text, fuzzy=True)
        return parsed_date.strftime("%Y-%m-%d %H:%M")
    except (ValueError, TypeError, OverflowError):
        # ValueError covers dateutil's ParserError; OverflowError is raised
        # by dateutil for out-of-range numeric components. In every failure
        # case fall back to the original text.
        return text

 # Calculate duration between two times (in hours and minutes)
def calculate_duration(start_time, end_time):
    """Return the time between two datetimes formatted as ``"<H>h <M>m"``.

    When ``end_time`` is earlier than ``start_time`` the range is assumed to
    cross midnight once (e.g. 23:00 -> 01:30 is "2h 30m"). If the end is
    still before the start after adding one day, the duration is reported
    as unknown rather than as a negative value.

    Args:
        start_time: ``datetime`` marking the start, or a falsy value.
        end_time: ``datetime`` marking the end, or a falsy value.

    Returns:
        A string such as ``"1h 30m"``, or ``"Unknown"`` when either bound is
        missing or the range cannot be interpreted.
    """
    if not start_time or not end_time:
        return "Unknown"

    delta = end_time - start_time
    if delta < timedelta(0):
        # Both times usually carry the same date, so an "earlier" end time
        # means the range wraps past midnight.
        delta += timedelta(days=1)
    if delta < timedelta(0):
        return "Unknown"

    hours, minutes = divmod(int(delta.total_seconds() // 60), 60)
    return f"{hours}h {minutes}m"


    
 #main function       
# Clock-time range such as "3PM to 5PM", "3 pm - 5 pm" or "10:30 until 11".
# The look-arounds stop the pattern from firing inside longer tokens, e.g. on
# the "01-15" of an ISO date like "2024-01-15".
_TIME_RANGE_RE = re.compile(
    r"(?<![\w:/-])"
    r"(\d{1,2}(?::\d{2})?\s*(?:am|pm)?)"
    r"\s*(?:to|until|-)\s*"
    r"(\d{1,2}(?::\d{2})?\s*(?:am|pm)?)"
    r"(?![\w:/-])",
    re.IGNORECASE,
)

# spaCy entity labels collected by process_text.
_ENTITY_LABELS = ("PERSON", "DATE", "TIME", "GPE", "EVENT")


def _parse_datetime(text):
    """Parse *text* with dateutil (fuzzy) and return a datetime, or None."""
    # normalize_data_time returns a formatted string, which cannot be used
    # for arithmetic or re-formatting, so parse to a datetime here instead.
    try:
        return dateutil.parser.parse(text, fuzzy=True)
    except (ValueError, TypeError, OverflowError):
        return None


def _parse_clock_time(text):
    """Parse a clock time such as "3PM", "10:30" or a bare hour like "3"."""
    text = text.strip()
    if text.isdigit():
        # dateutil reads a bare number as a day of the month, not an hour.
        text = f"{text}:00"
    return _parse_datetime(text)


def _infer_event(doc):
    """Return the lemma of the first verb/noun in a core role, or None."""
    for token in doc:
        if token.pos_ in ("VERB", "NOUN") and token.dep_ in ("ROOT", "dobj", "nsubj"):
            return token.lemma_
    return None


def process_text(text):
    """Extract a structured event summary from free-form text.

    Named entities (people, dates, times, locations, events) are taken from
    spaCy. Dates are normalized to ``YYYY-MM-DD`` when they can be parsed and
    otherwise kept verbatim. If spaCy finds no EVENT entity, the event is
    inferred from the first verb or noun acting as root, direct object or
    subject. A time range such as "3PM to 5PM" supplies the start time, end
    time and duration; without a range, the first TIME entity is used as the
    start time.

    Args:
        text: The input text (for example typed or OCR-extracted).

    Returns:
        A multi-line string with one ``"Field: value"`` line each for Event,
        People, Date, Start Time, End Time, Duration and Location.
    """
    text = text or ""
    doc = nlp(text)

    # Collect the entity types of interest, normalizing dates on the way.
    entities = {label: [] for label in _ENTITY_LABELS}
    for ent in doc.ents:
        if ent.label_ not in entities:
            continue
        value = ent.text
        if ent.label_ == "DATE":
            parsed_date = _parse_datetime(ent.text)
            if parsed_date is not None:
                value = parsed_date.strftime("%Y-%m-%d")
        entities[ent.label_].append(value)

    # Infer the event once, and only when spaCy did not name one explicitly.
    if not entities["EVENT"]:
        inferred_event = _infer_event(doc)
        if inferred_event:
            entities["EVENT"].append(inferred_event)

    # Prefer an explicit range ("3PM to 5PM"); otherwise fall back to the
    # first TIME entity as the start time.
    start_time = None
    end_time = None
    match = _TIME_RANGE_RE.search(text)
    if match:
        start_time = _parse_clock_time(match.group(1))
        end_time = _parse_clock_time(match.group(2))
    elif entities["TIME"]:
        start_time = _parse_clock_time(entities["TIME"][0])

    # calculate_duration reports "Unknown" when either bound is missing.
    duration = calculate_duration(start_time, end_time)
    start_time_str = start_time.strftime("%H:%M") if start_time else "None"
    end_time_str = end_time.strftime("%H:%M") if end_time else "None"

    def joined(label):
        # Comma-separated entity list, or "None" when nothing was found.
        return ", ".join(entities[label]) if entities[label] else "None"

    result = {
        "Event": entities["EVENT"][0] if entities["EVENT"] else "Unknown",
        "People": joined("PERSON"),
        "Date": joined("DATE"),
        "Start Time": start_time_str,
        "End Time": end_time_str,
        "Duration": duration,
        "Location": joined("GPE"),
    }

    # One "Field: value" line per entry, in the order defined above.
    return "".join(f"{field}: {value}\n" for field, value in result.items())


# Gradio interface. This must live at module level: nested inside
# process_text after its return statement it was unreachable and the app
# never started.
demo = gr.Interface(
    fn=process_text,
    inputs="text",
    outputs="text",
    title="Text Normalization ",
    description="Input text or OCR_extracted text to normalize and extract event with start time, end time, and duration",
)


if __name__ == "__main__":
    demo.launch()