import streamlit as st
import os
import logging
from io import BytesIO
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain.chains.question_answering import load_qa_chain
from langchain_community.llms import HuggingFaceHub
from transformers import pipeline  # Local text2text model used for answer generation
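# Note: PromptTemplate, load_qa_chain, and HuggingFaceHub are imported but not
# used below; answers are generated with the local transformers pipeline instead.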
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Check API token
if "HUGGINGFACEHUB_API_TOKEN" not in os.environ:
    st.error("HUGGINGFACEHUB_API_TOKEN not set in secrets. Add it in Space settings.")
    st.stop()
try:
    # Function to process PDF
    def process_pdf(uploaded_file):
        try:
            logger.info("Starting PDF processing")
            pdf_reader = PdfReader(BytesIO(uploaded_file.getvalue()))
            text = ""
            for page in pdf_reader.pages:
                extracted = page.extract_text()
                if extracted:
                    text += extracted + "\n"
            if not text:
                raise ValueError("No text extracted from PDF.")

            # Chunk text (increased overlap for better context)
            text_splitter = CharacterTextSplitter(
                separator="\n", chunk_size=800, chunk_overlap=200, length_function=len
            )
            chunks = text_splitter.split_text(text)

            # Embeddings (light model)
            embeddings = HuggingFaceEmbeddings(
                model_name="sentence-transformers/all-MiniLM-L6-v2",
                model_kwargs={"device": "cpu"},
            )

            # Vector store
            vector_store = FAISS.from_texts(chunks, embedding=embeddings)
            logger.info("PDF processed successfully")
            return vector_store
        except Exception as e:
            logger.error(f"PDF processing error: {str(e)}")
            st.error(f"Error processing PDF: {str(e)}")
            return None
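    # Optional (not in the original code): cache the embedding model across
    # Streamlit reruns so it is not reloaded every time a PDF is processed.
    # process_pdf() could call get_embeddings() in place of the inline
    # HuggingFaceEmbeddings(...) construction above. Sketch only.
    @st.cache_resource
    def get_embeddings():
        return HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={"device": "cpu"},
        )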
    # Function to answer questions
    def answer_question(vector_store, query):
        try:
            logger.info(f"Answering query: {query}")

            # Lighter LLM via pipeline for faster CPU inference
            qa_pipeline = pipeline("text2text-generation", model="google/flan-t5-base")

            # Retrieve top chunks
            docs = vector_store.similarity_search(query, k=3)
            context = "\n".join([doc.page_content for doc in docs])

            # Prompt
            prompt = f"Use this context to answer concisely: {context}\nQuestion: {query}\nAnswer:"
            response = qa_pipeline(prompt, max_length=256, num_return_sequences=1)[0]["generated_text"]
            logger.info("Answer generated")
            return response.strip()
        except Exception as e:
            logger.error(f"Answer generation error: {str(e)}")
            st.error(f"Error answering: {str(e)}")
            return "Unable to generate answer."
    # Streamlit UI with chat history
    st.title("Smart PDF Q&A")
    st.write("Upload a PDF and ask questions! Chat history is preserved.")

    # Initialize session state
    if "messages" not in st.session_state:
        st.session_state.messages = []
    if "vector_store" not in st.session_state:
        st.session_state.vector_store = None

    # PDF upload and process
    uploaded_file = st.file_uploader("Upload PDF", type="pdf")
    if uploaded_file:
        if st.button("Process PDF"):
            with st.spinner("Processing..."):
                vector_store = process_pdf(uploaded_file)
                if vector_store:
                    st.session_state.vector_store = vector_store
                    st.success("PDF ready! Ask away.")
                    st.session_state.messages = []  # Reset chat on new PDF

    # Display chat history
    for message in st.session_state.messages:
        with st.chat_message(message["role"]):
            st.markdown(message["content"])

    # Question input
    if st.session_state.vector_store:
        if prompt := st.chat_input("Ask a question:"):
            # Add user message
            st.session_state.messages.append({"role": "user", "content": prompt})
            with st.chat_message("user"):
                st.markdown(prompt)

            # Generate answer
            with st.chat_message("assistant"):
                with st.spinner("Thinking..."):
                    answer = answer_question(st.session_state.vector_store, prompt)
                    st.markdown(answer)
            st.session_state.messages.append({"role": "assistant", "content": answer})
except Exception as e:
    logger.error(f"App initialization failed: {str(e)}")
    st.error(f"Initialization error: {str(e)}. Check logs or try factory reset.")