Sem

Sleeping

App Files Files Community

Sem / app.py

khushidhar1210

with pegasus

3a1a521 verified 9 months ago

raw

history blame contribute delete

6.61 kB

	import streamlit as st
	import geopandas as gpd
	import sqlite3
	import pandas as pd
	import os
	import torch
	from transformers import pipeline
	from sentence_transformers import SentenceTransformer
	import faiss
	import numpy as np

	# Set the SHAPE_RESTORE_SHX configuration option to YES before any
	# shapefile is opened further down in this script.
	os.environ['SHAPE_RESTORE_SHX'] = 'YES'

	# Choose the torch device: CUDA when a GPU is visible, CPU otherwise,
	# and show the choice in the Streamlit page.
	if torch.cuda.is_available():
	    device = torch.device("cuda")
	else:
	    device = torch.device("cpu")
	st.write(f"Using device: {device}")

	# DDL for the floodland table: an autoincrement id, the flood-hazard
	# attribute columns, the geometry as text, and the two derived area columns.
	# NOTE(review): the later `to_sql('floodlands', ..., if_exists='replace')`
	# call drops and recreates this table from the DataFrame, so this schema
	# is not what ends up stored.
	_CREATE_FLOODLANDS_TABLE = '''
	CREATE TABLE IF NOT EXISTS floodlands (
	id INTEGER PRIMARY KEY AUTOINCREMENT,
	DFIRM_ID TEXT,
	VERSION_ID TEXT,
	FLD_AR_ID TEXT,
	STUDY_TYP TEXT,
	FLD_ZONE TEXT,
	ZONE_SUBTY TEXT,
	SFHA_TF TEXT,
	STATIC_BFE FLOAT,
	V_DATUM TEXT,
	DEPTH FLOAT,
	LEN_UNIT TEXT,
	VELOCITY FLOAT,
	VEL_UNIT TEXT,
	AR_REVERT TEXT,
	AR_SUBTRV TEXT,
	BFE_REVERT FLOAT,
	DEP_REVERT FLOAT,
	DUAL_ZONE TEXT,
	SOURCE_CIT TEXT,
	geometry TEXT,
	acreage FLOAT,
	usable_area FLOAT
	)
	'''

	# Open (or create) the SQLite database file in the current working directory.
	conn = sqlite3.connect('NY.db')
	cursor = conn.cursor()

	# Make sure the table exists, then persist the DDL.
	cursor.execute(_CREATE_FLOODLANDS_TABLE)
	conn.commit()

	# Load the flood-hazard shapefile that sits next to this script.
	shapefile_path = os.path.join(os.path.dirname(__file__), 'S_FLD_HAZ_AR.shp')
	gdf = gpd.read_file(shapefile_path)

	# Show the CRS the file was read with.
	st.write("Initial CRS:", gdf.crs)

	# If the file carries no CRS, assume WGS84 (EPSG:4326).
	# NOTE(review): FEMA NFHL layers are normally published in NAD83
	# (EPSG:4269) -- confirm which datum this file really uses.
	if gdf.crs is None:
	    gdf = gdf.set_crs(epsg=4326)
	    st.write("CRS was missing; set to EPSG:4326 (WGS84).")

	# Keep only rows whose geometry is present AND valid. A feature without
	# a geometry is read as None, so calling `.is_valid` on each element
	# (as the per-row lambda used to) would raise AttributeError; the
	# vectorized mask treats missing geometries as "not valid" instead.
	valid_mask = gdf['geometry'].notna() & gdf['geometry'].is_valid
	invalid_count = int((~valid_mask).sum())
	if invalid_count:
	    st.write(f"Found {invalid_count} invalid or null geometries. Dropping these rows.")
	    # .copy() so the column assignments below write to an independent frame.
	    gdf = gdf[valid_mask].copy()

	# Reproject to UTM Zone 18N (EPSG:32618) so areas come out in square metres.
	gdf = gdf.to_crs(epsg=32618)
	st.write("CRS after reprojection:", gdf.crs)

	# Calculate acreage (1 square meter = 0.000247105 acres)
	gdf['acreage'] = gdf.geometry.area * 0.000247105

	# Usable area: the full acreage for rows outside the flood-prone zones,
	# 0 for rows inside them. Rows with a missing FLD_ZONE count as usable,
	# exactly as with the previous row-wise `not in` test.
	flood_prone_zones = ['A', 'AE', 'AH', 'AO', 'VE']
	in_flood_zone = gdf['FLD_ZONE'].isin(flood_prone_zones)
	gdf['usable_area'] = gdf['acreage'].where(~in_flood_zone, 0.0)

	# Convert geometry to WKT text so it can be stored in SQLite.
	gdf['wkt_geometry'] = gdf['geometry'].apply(lambda geom: geom.wkt)

	# SQLite cannot store geometry objects, so write a copy without that column.
	gdf_for_sql = gdf.drop(columns=['geometry'])

	# Store in SQLite. `if_exists='replace'` drops any existing `floodlands`
	# table and recreates it from the DataFrame's columns. The connection is
	# closed even if the write fails.
	try:
	    rows_inserted = gdf_for_sql.to_sql('floodlands', conn, if_exists='replace', index=False)
	    if rows_inserted is None:
	        # Older pandas versions return None from to_sql; report the frame size.
	        rows_inserted = len(gdf_for_sql)
	    st.write(f"Inserted {rows_inserted} rows into the database.")
	finally:
	    conn.close()

	# Streamlit re-executes this script from the top on every widget
	# interaction, so an uncached constructor call would rebuild both models
	# on every button click. `st.cache_resource` keeps one instance per server
	# process; on Streamlit versions without it, fall back to an uncached call.
	_cache_resource = getattr(st, "cache_resource", lambda load_fn: load_fn)


	@_cache_resource
	def _load_embedder():
	    """Build the sentence-embedding model used for records and queries."""
	    return SentenceTransformer('all-MiniLM-L6-v2')


	@_cache_resource
	def _load_summarizer():
	    """Build the google/pegasus-xsum summarization pipeline (GPU 0 if available, else CPU)."""
	    return pipeline(
	        "summarization",
	        model="google/pegasus-xsum",
	        device=0 if torch.cuda.is_available() else -1,
	    )


	# Module-level handles used by the rest of the script.
	embedder = _load_embedder()
	summarizer = _load_summarizer()

	def _describe_floodland(row):
	    """Render one floodland row as the sentence that gets embedded."""
	    return (
	        f"Floodland area with FLD_ZONE: {row['FLD_ZONE']}, ZONE_SUBTY: {row['ZONE_SUBTY']}, "
	        f"acreage: {row['acreage']:.2f} acres, usable area: {row['usable_area']:.2f} acres"
	    )


	# One descriptive sentence per record, kept on the frame as `text`.
	gdf['text'] = gdf.apply(_describe_floodland, axis=1)

	# Encode every sentence into a vector.
	_record_texts = gdf['text'].tolist()
	embeddings = embedder.encode(_record_texts, show_progress_bar=True)

	# Flat L2 FAISS index sized to the embedding dimension; vectors are added
	# in row order, so index position i corresponds to gdf.iloc[i].
	d = embeddings.shape[1]
	index = faiss.IndexFlatL2(d)
	index.add(embeddings)

	# Keep each record's vector alongside its row.
	gdf['embedding'] = list(embeddings)

	# RAG-based summarization function (without prompt)
	def rag_summarize(query, gdf, index, k=5):
	    """Summarize the floodland records most similar to ``query``.

	    The query is embedded with the module-level ``embedder``, the nearest
	    records are looked up in ``index``, their acreage figures are summed
	    into a short narrative, and that narrative is passed to the
	    module-level ``summarizer``. Intermediate text is echoed to the page
	    with ``st.write``.

	    Args:
	        query: Free-text location string entered by the user.
	        gdf: Frame with ``acreage``, ``usable_area`` and ``FLD_ZONE``
	            columns whose row positions match the vector positions in
	            ``index`` (rows are selected with ``iloc``).
	        index: Search index exposing ``ntotal`` and ``search(vectors, k)``.
	        k: Maximum number of records to retrieve.

	    Returns:
	        The summary text. If the summarizer fails or returns an unexpected
	        structure, a manually assembled summary is returned instead. If
	        nothing can be retrieved (empty index or ``k <= 0``), a short
	        "no records" message is returned.
	    """
	    no_data_message = f"No floodland records are available to summarize for {query}."

	    # FAISS pads its result with -1 when asked for more neighbours than the
	    # index holds, and gdf.iloc[-1] would silently pick the LAST row, so
	    # never request more than the index contains.
	    top_k = min(k, index.ntotal)
	    if top_k <= 0:
	        return no_data_message

	    # Embed the query as a single-row float32 matrix, the shape FAISS expects.
	    query_embedding = embedder.encode([query])[0]
	    query_matrix = np.asarray([query_embedding], dtype=np.float32)

	    # Retrieve the nearest records, discarding any -1 padding defensively.
	    _, indices = index.search(query_matrix, top_k)
	    hit_positions = [int(pos) for pos in indices[0] if pos >= 0]
	    if not hit_positions:
	        return no_data_message
	    retrieved_docs = gdf.iloc[hit_positions]
	    top_doc = retrieved_docs.iloc[0]  # closest match, named in the narrative

	    # Aggregate acreage and usable area from retrieved documents
	    total_acreage = retrieved_docs['acreage'].sum()
	    usable_acreage = retrieved_docs['usable_area'].sum()

	    # Create a simplified narrative context
	    context = (f"The floodlands in {query} cover a total of {total_acreage:.2f} acres, with {usable_acreage:.2f} acres usable for development. "
	               f"A significant portion, spanning {top_doc['acreage']:.2f} acres, is classified as {top_doc['FLD_ZONE']} flood zone, "
	               f"indicating potential flood risks that could impact development projects.")

	    # Debug: Display the narrative context
	    st.write("Narrative context for summarization:")
	    st.write(context)

	    # Generate the summary. Any failure (model error or unexpected output
	    # shape) is reported on the page and replaced by a hand-written
	    # fallback so the user always gets an answer.
	    try:
	        summary_output = summarizer(context, max_length=150, min_length=100, do_sample=False)
	        st.write("Raw summarizer output:", summary_output)
	        if isinstance(summary_output, list) and len(summary_output) > 0 and 'summary_text' in summary_output[0]:
	            summary = summary_output[0]['summary_text']
	        else:
	            raise ValueError("Unexpected output format from summarizer.")
	    except Exception as e:
	        st.write(f"Error in summarization: {e}")
	        # Fallback: Generate a basic summary manually
	        summary = (f"The floodlands in {query} cover a total of {total_acreage:.2f} acres. "
	                   f"Only {usable_acreage:.2f} acres are usable for development due to flood risks. "
	                   f"A significant portion, including {top_doc['acreage']:.2f} acres, falls under the {top_doc['FLD_ZONE']} flood zone. "
	                   f"This indicates a high risk of flooding, which could impact power plant projects. "
	                   f"Overall, the limited usable land poses challenges for development in this region.")

	    return summary

	# Streamlit interface
	st.title("Floodland Summary Bot")

	# Input field for the user to enter a location. The placeholder now says
	# "location", matching the validation message below; the old text asked
	# for a state name but gave a city as its example.
	user_input = st.text_input("Input:", placeholder="Enter a location (e.g., New York)")

	# Submit button
	if st.button("Submit"):
	    # Treat whitespace-only input as empty so a blank query is never sent
	    # to retrieval.
	    location = (user_input or "").strip()
	    if location:
	        st.write("Hi, How can I help you today?")
	        st.write(f"User input: {location}")

	        # Generate summary from the 5 closest floodland records.
	        summary = rag_summarize(location, gdf, index, k=5)
	        st.write(summary)
	    else:
	        st.write("Please enter a location to proceed.")