Spaces:

Zoe911
/

spaCy-C

Sleeping

App Files Community

Zoe911 committed on Apr 17

Commit

36e8407

verified ·

1 Parent(s): 3b8de2e

Update app.py

Browse files

Files changed (1) hide show

app.py +22 -36

app.py CHANGED Viewed

@@ -23,13 +23,12 @@ nlp = load_spacy_model()
 # Helper function: Normalize date and time expressions
 def normalize_data_time(text):
     try:
-        # Use dateutil to parse the time
         parsed_date = dateutil.parser.parse(text, fuzzy=True)
-        return parsed_date  # Return datetime object instead of string
     except (ValueError, TypeError):
-        return None  # Return None if parsing fails
-# Calculate duration between two times (in hours and minutes)
 def calculate_duration(start_time, end_time):
     if isinstance(start_time, datetime) and isinstance(end_time, datetime):
         delta = end_time - start_time
@@ -39,68 +38,58 @@ def calculate_duration(start_time, end_time):
         return f"{hours}h {minutes}m"
     return "Unknown"
-# Main function
 def process_text(text):
     doc = nlp(text)
-    # Extract named entities
     entities = {
-        "PERSON": [],  # Names
-        "DATE": [],    # Dates
-        "TIME": [],    # Times
-        "GPE": [],     # Locations
-        "EVENT": []    # Events
     }
-    # Extract information from spaCy's named entity recognition
     for ent in doc.ents:
         if ent.label_ in entities:
             entities[ent.label_].append(ent.text)
-        # Normalize dates
         if ent.label_ == "DATE":
             normalized = normalize_data_time(ent.text)
             entities[ent.label_][-1] = normalized.strftime("%Y-%m-%d") if normalized else ent.text
-    # Infer the event (look for verbs or noun phrases)
     event = None
     for token in doc:
         if token.pos_ in ["VERB", "NOUN"] and token.dep_ in ["ROOT", "dobj", "nsubj"]:
-            event = token.lemma_  # Use the lemmatized form
             break
     if event:
         entities["EVENT"].append(event)
-    # Extract start time, end time, and calculate duration
     start_time = None
     end_time = None
     duration = "Unknown"
-    # Look for patterns like "from 3PM to 4PM" or "3PM to 5PM"
-    time_pattern = re.compile(r"(\d{1,2}(?::\d{2})?\s*(?:AM|PM|am|pm)?)\s*(?:to|until|-)\s*(\d{1,2}(?::\d{2})?\s*(?:AM|PM|am|pm)?)", re.IGNORECASE)
     match = time_pattern.search(text)
     if match:
-        # Found a range like "3PM to 4PM"
         start_str, end_str = match.groups()
         start_time = normalize_data_time(start_str)
         end_time = normalize_data_time(end_str)
     else:
-         # If no range, use the first and second TIME entities
-        if len(entities["TIME"]) >= 1:
-            start_time = normalize_data_time(entities["TIME"][0])
-        if len(entities["TIME"]) >= 2:
-            end_time = normalize_data_time(entities["TIME"][1])
-    # Calculate duration if both start and end times are available
     if start_time and end_time:
         duration = calculate_duration(start_time, end_time)
-        start_time_str = start_time.strftime("%H:%M")
-        end_time_str = end_time.strftime("%H:%M")
-    else:
-        start_time_str = start_time.strftime("%H:%M") if start_time else "None"
-        end_time_str = "None"
-    # Construct structured output
     result = {
         "Event": entities["EVENT"][0] if entities["EVENT"] else "Unknown",
         "People": ", ".join(entities["PERSON"]) if entities["PERSON"] else "None",
@@ -111,7 +100,6 @@ def process_text(text):
         "Location": ", ".join(entities["GPE"]) if entities["GPE"] else "None"
     }
-    # Format the output
     output = (
         f"Event: {result['Event']}\n"
         f"People: {result['People']}\n"
@@ -123,7 +111,7 @@ def process_text(text):
     )
     return output
-# Gradio interface (moved outside of process_text)
 demo = gr.Interface(
     fn=process_text,
     inputs="text",
@@ -132,8 +120,6 @@ demo = gr.Interface(
     description="Input text or OCR-extracted text to normalize and extract event with start time, end time, and duration"
 )
-# Launch the app
 if __name__ == "__main__":
     print("Launching Gradio application...")
     demo.launch(server_name="0.0.0.0", server_port=7860)

 # Helper function: Normalize date and time expressions
 def normalize_data_time(text):
     try:
         parsed_date = dateutil.parser.parse(text, fuzzy=True)
+        return parsed_date
     except (ValueError, TypeError):
+        return None
+# Calculate duration between two times
 def calculate_duration(start_time, end_time):
     if isinstance(start_time, datetime) and isinstance(end_time, datetime):
         delta = end_time - start_time
         return f"{hours}h {minutes}m"
     return "Unknown"
+# Main processing function
 def process_text(text):
     doc = nlp(text)
     entities = {
+        "PERSON": [],
+        "DATE": [],
+        "TIME": [],
+        "GPE": [],
+        "EVENT": []
     }
     for ent in doc.ents:
         if ent.label_ in entities:
             entities[ent.label_].append(ent.text)
         if ent.label_ == "DATE":
             normalized = normalize_data_time(ent.text)
             entities[ent.label_][-1] = normalized.strftime("%Y-%m-%d") if normalized else ent.text
     event = None
     for token in doc:
         if token.pos_ in ["VERB", "NOUN"] and token.dep_ in ["ROOT", "dobj", "nsubj"]:
+            event = token.lemma_
             break
     if event:
         entities["EVENT"].append(event)
     start_time = None
     end_time = None
     duration = "Unknown"
+    # Try to match range
+    time_pattern = re.compile(r"(\d{1,2}(?::\d{2})?\s*(?:AM|PM|am|pm)?)\s*(?:to|until|-|–)\s*(\d{1,2}(?::\d{2})?\s*(?:AM|PM|am|pm)?)", re.IGNORECASE)
     match = time_pattern.search(text)
     if match:
         start_str, end_str = match.groups()
         start_time = normalize_data_time(start_str)
         end_time = normalize_data_time(end_str)
     else:
+        # Fallback: extract multiple AM/PM time entries
+        ampm_matches = re.findall(r"\d{1,2}(?::\d{2})?\s*[AaPp][Mm]", text)
+        if len(ampm_matches) >= 1:
+            start_time = normalize_data_time(ampm_matches[0])
+        if len(ampm_matches) >= 2:
+            end_time = normalize_data_time(ampm_matches[1])
+    start_time_str = start_time.strftime("%H:%M") if start_time else "None"
+    end_time_str = end_time.strftime("%H:%M") if end_time else "None"
     if start_time and end_time:
         duration = calculate_duration(start_time, end_time)
     result = {
         "Event": entities["EVENT"][0] if entities["EVENT"] else "Unknown",
         "People": ", ".join(entities["PERSON"]) if entities["PERSON"] else "None",
         "Location": ", ".join(entities["GPE"]) if entities["GPE"] else "None"
     }
     output = (
         f"Event: {result['Event']}\n"
         f"People: {result['People']}\n"
     )
     return output
+# Gradio interface
 demo = gr.Interface(
     fn=process_text,
     inputs="text",
     description="Input text or OCR-extracted text to normalize and extract event with start time, end time, and duration"
 )
 if __name__ == "__main__":
     print("Launching Gradio application...")
     demo.launch(server_name="0.0.0.0", server_port=7860)