# spaCy-C / app.py — Gradio Space by Zoe911 (commit 1ce3e50)
import os
import re
import subprocess
import sys
from datetime import datetime, timedelta

import dateutil.parser
import gradio as gr
import spacy
# Load the spaCy large English model, downloading it on first run.
try:
    nlp = spacy.load("en_core_web_lg")
except OSError:
    print("Model not found. Downloading en_core_web_lg...")
    # Run the downloader with an argument list (shell=False) instead of
    # os.system with a shell string; check=True makes a failed download
    # raise immediately instead of silently retrying spacy.load below.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_lg", "--no-cache"],
        check=True,
    )
    nlp = spacy.load("en_core_web_lg")
# Helper function: normalize date and time expressions.
def normalize_data_time(text):
    """Fuzzily parse a date/time expression into a datetime.

    Args:
        text: A natural-language date/time expression (e.g. "3PM",
            "March 5th 2024").

    Returns:
        datetime: The parsed value, or None when parsing fails.

    Note:
        Every caller in this file subtracts or strftime()s the result,
        so the contract is a datetime object (previously this returned
        a formatted string, which crashed those call sites).
    """
    try:
        # Use dateutil to parse the time; fuzzy=True skips unrelated words.
        return dateutil.parser.parse(text, fuzzy=True)
    except (ValueError, TypeError, OverflowError):
        # Unparseable input: signal failure explicitly with None.
        return None
def calculate_duration(start_time, end_time):
    """Format the gap between two datetimes as "<hours>h <minutes>m".

    Returns "Unknown" when either endpoint is missing/falsy.
    """
    if not (start_time and end_time):
        return "Unknown"
    elapsed = end_time - start_time
    # Truncate to whole minutes, then split into hours + leftover minutes.
    hours, minutes = divmod(int(elapsed.total_seconds() / 60), 60)
    return f"{hours}h {minutes}m"
# --- main function helpers ---
def _parse_datetime(text):
    """Fuzzily parse *text* into a datetime; return None when unparseable."""
    try:
        return dateutil.parser.parse(text, fuzzy=True)
    except (ValueError, TypeError, OverflowError):
        return None


# Matches ranges like "3PM to 4PM", "10:30 until 11:15", "9 - 17".
# (The original pattern had a stray `\d*` where `\s*` was intended, so the
# first time could not be separated from its AM/PM marker by a space.)
_TIME_RANGE_RE = re.compile(
    r"(\d{1,2}(?::\d{2})?\s*(?:AM|PM)?)\s*(?:to|until|-)\s*"
    r"(\d{1,2}(?::\d{2})?\s*(?:AM|PM)?)",
    re.IGNORECASE,
)


# main function
def process_text(text):
    """Extract a structured event summary from free-form text.

    Uses spaCy NER for people/dates/times/locations, a dependency-based
    heuristic for the event word, and a regex + dateutil for time ranges.

    Args:
        text: Raw (possibly OCR-extracted) text describing an event.

    Returns:
        str: A multi-line "Field: value" summary.
    """
    doc = nlp(text)

    # Buckets for the entity labels we care about.
    entities = {
        "PERSON": [],  # names
        "DATE": [],    # dates
        "TIME": [],    # times
        "GPE": [],     # locations
        "EVENT": [],   # events (filled by the heuristic below)
    }

    # Extract information from spaCy's named-entity recognition.
    for ent in doc.ents:
        if ent.label_ not in entities:
            continue
        if ent.label_ == "DATE":
            # Normalize dates to ISO form when parseable; keep raw text
            # otherwise. (The original called .strftime() on a string,
            # which raised AttributeError for every DATE entity.)
            parsed = _parse_datetime(ent.text)
            entities["DATE"].append(parsed.strftime("%Y-%m-%d") if parsed else ent.text)
        else:
            entities[ent.label_].append(ent.text)

    # Infer the event: first ROOT/dobj/nsubj verb or noun, lemmatized.
    for token in doc:
        if token.pos_ in ("VERB", "NOUN") and token.dep_ in ("ROOT", "dobj", "nsubj"):
            entities["EVENT"].append(token.lemma_)
            break

    # Extract start/end times; prefer an explicit range like "3PM to 4PM".
    start_time = None
    end_time = None
    match = _TIME_RANGE_RE.search(text)
    if match:
        start_str, end_str = match.groups()
        start_time = _parse_datetime(start_str)
        end_time = _parse_datetime(end_str)
    elif entities["TIME"]:
        # No range found: use the first TIME entity as the start time.
        start_time = _parse_datetime(entities["TIME"][0])

    # Calculate the duration when both endpoints parsed. The originals
    # subtracted and strftime()'d *strings*, crashing on any time range;
    # here both values are datetimes (or None).
    duration = "Unknown"
    if start_time and end_time:
        if end_time < start_time:
            # Range like "11PM to 1AM" crosses midnight; roll end forward.
            end_time += timedelta(days=1)
        duration = calculate_duration(start_time, end_time)
        start_time_str = start_time.strftime("%H:%M")
        end_time_str = end_time.strftime("%H:%M")
    else:
        # (Also fixes the `.stftime` typo the original hit on this path.)
        start_time_str = start_time.strftime("%H:%M") if start_time else "None"
        end_time_str = "None"

    # Construct structured output.
    result = {
        "Event": entities["EVENT"][0] if entities["EVENT"] else "Unknown",
        "People": ",".join(entities["PERSON"]) if entities["PERSON"] else "None",
        "Date": ", ".join(entities["DATE"]) if entities["DATE"] else "None",
        "Start Time": start_time_str,
        "End Time": end_time_str,
        "Duration": duration,
        "Location": ", ".join(entities["GPE"]) if entities["GPE"] else "None",
    }

    # Format the output.
    return (
        f"Event: {result['Event']}\n"
        f"People: {result['People']}\n"
        f"Date: {result['Date']}\n"
        f"Start Time: {result['Start Time']}\n"
        f"End Time: {result['End Time']}\n"
        f"Duration: {result['Duration']}\n"
        f"Location: {result['Location']}\n"
    )
# Gradio interface: single text box in, formatted event summary out.
# NOTE(review): demo.launch() runs at import time — standard for a
# Hugging Face Space where app.py is executed directly.
demo = gr.Interface(
fn = process_text,
inputs = "text",
outputs = "text",
title = "Text Normalization ",
description = "Input text or OCR_extracted text to normalize and extract event with start time, end time, and duration"
)
demo.launch()