# NOTE: removed non-Python scraping residue (file-size banner, git blob hashes,
# and a duplicated line-number gutter) that preceded the actual source below.
import gradio as gr
import spacy
import dateutil.parser
import re
import os
import sys
from datetime import datetime, timedelta

# Load the spaCy model
def load_spacy_model():
    """Load the spaCy ``en_core_web_sm`` pipeline, downloading it on first use.

    Returns:
        The loaded spaCy ``Language`` pipeline.

    Raises:
        OSError: if the model still cannot be loaded after downloading.
        subprocess.CalledProcessError: if the download command itself fails.
    """
    import subprocess  # local import: only needed on the cold-start path

    try:
        nlp = spacy.load("en_core_web_sm")
        print("Model loaded successfully.")
    except OSError:
        print("Model not found. Downloading en_core_web_sm...")
        # subprocess.run with an argument list replaces os.system with an
        # interpolated shell string: no shell parsing of sys.executable's
        # path, and check=True surfaces a failed download immediately
        # instead of falling through to a second (doomed) spacy.load call.
        subprocess.run(
            [sys.executable, "-m", "spacy", "download", "en_core_web_sm", "--no-cache"],
            check=True,
        )
        nlp = spacy.load("en_core_web_sm")
        print("Model downloaded and loaded successfully.")
    return nlp

nlp = load_spacy_model()

# Helper function: Normalize date and time expressions
def normalize_data_time(text):
    """Fuzzily parse *text* into a ``datetime``.

    Returns the parsed ``datetime`` on success, or ``None`` when the input
    cannot be interpreted as a date/time expression (or is not a string).
    """
    try:
        return dateutil.parser.parse(text, fuzzy=True)
    except (ValueError, TypeError):
        return None

# Calculate duration between two times
def calculate_duration(start_time, end_time):
    """Format the elapsed time between two datetimes as ``"{H}h {M}m"``.

    Args:
        start_time: beginning of the interval (``datetime``).
        end_time: end of the interval (``datetime``).

    Returns:
        str: e.g. ``"1h 30m"``; ``"-1h 30m"`` when *end_time* precedes
        *start_time*; ``"Unknown"`` when either argument is not a
        ``datetime``.
    """
    if not (isinstance(start_time, datetime) and isinstance(end_time, datetime)):
        return "Unknown"
    total_minutes = int((end_time - start_time).total_seconds() // 60)
    # Split the *magnitude* with divmod and re-apply the sign once.  The
    # original floor-division/modulo on a negative minute count produced
    # results like "-2h 30m" for a -90 minute interval instead of "-1h 30m".
    sign = "-" if total_minutes < 0 else ""
    hours, minutes = divmod(abs(total_minutes), 60)
    return f"{sign}{hours}h {minutes}m"

# Main processing function
def process_text(text):
    """Summarize event details extracted from free-form *text*.

    Runs the spaCy pipeline over the input, normalizes DATE entities to
    ISO format, guesses the event from the first salient VERB/NOUN token,
    and pulls a start/end time pair out of the raw text with regexes.

    Returns a multi-line string with Event, People, Date, Start Time,
    End Time, Duration, and Location fields.
    """
    doc = nlp(text)

    entities = {label: [] for label in ("PERSON", "DATE", "TIME", "GPE", "EVENT")}

    for ent in doc.ents:
        if ent.label_ not in entities:
            continue
        entities[ent.label_].append(ent.text)
        if ent.label_ == "DATE":
            # Replace the just-appended raw span with an ISO date when it parses.
            parsed = normalize_data_time(ent.text)
            entities["DATE"][-1] = parsed.strftime("%Y-%m-%d") if parsed else ent.text

    # Heuristic event label: lemma of the first verb/noun acting as root,
    # direct object, or subject.
    event = next(
        (
            tok.lemma_
            for tok in doc
            if tok.pos_ in ("VERB", "NOUN") and tok.dep_ in ("ROOT", "dobj", "nsubj")
        ),
        None,
    )
    if event:
        entities["EVENT"].append(event)

    start_time = None
    end_time = None
    duration = "Unknown"

    # Prefer an explicit "X to/until/-/– Y" time range...
    range_pattern = re.compile(
        r"(\d{1,2}(?::\d{2})?\s*(?:AM|PM|am|pm)?)\s*(?:to|until|-|–)\s*"
        r"(\d{1,2}(?::\d{2})?\s*(?:AM|PM|am|pm)?)",
        re.IGNORECASE,
    )
    range_match = range_pattern.search(text)
    if range_match:
        start_time = normalize_data_time(range_match.group(1))
        end_time = normalize_data_time(range_match.group(2))
    else:
        # ...otherwise fall back to the first one or two standalone AM/PM times.
        standalone_times = re.findall(r"\d{1,2}(?::\d{2})?\s*[AaPp][Mm]", text)
        if standalone_times:
            start_time = normalize_data_time(standalone_times[0])
        if len(standalone_times) > 1:
            end_time = normalize_data_time(standalone_times[1])

    if start_time and end_time:
        duration = calculate_duration(start_time, end_time)

    fields = [
        ("Event", entities["EVENT"][0] if entities["EVENT"] else "Unknown"),
        ("People", ", ".join(entities["PERSON"]) if entities["PERSON"] else "None"),
        ("Date", ", ".join(entities["DATE"]) if entities["DATE"] else "None"),
        ("Start Time", start_time.strftime("%H:%M") if start_time else "None"),
        ("End Time", end_time.strftime("%H:%M") if end_time else "None"),
        ("Duration", duration),
        ("Location", ", ".join(entities["GPE"]) if entities["GPE"] else "None"),
    ]
    return "".join(f"{label}: {value}\n" for label, value in fields)

# Gradio interface
# Single text-in/text-out UI; process_text runs once per submission.
demo = gr.Interface(
    fn=process_text,
    inputs="text",
    outputs="text",
    title="Text Normalization",
    description="Input text or OCR-extracted text to normalize and extract event with start time, end time, and duration"
)

if __name__ == "__main__":
    print("Launching Gradio application...")
    # Bind to all interfaces on Gradio's default port (container-friendly).
    demo.launch(server_name="0.0.0.0", server_port=7860)