import streamlit as st
import os
import logging
from io import BytesIO

from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain.chains.question_answering import load_qa_chain
from langchain_community.llms import HuggingFaceHub
from transformers import pipeline  # For fallback if Hub fails

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Check API token
if "HUGGINGFACEHUB_API_TOKEN" not in os.environ:
    st.error("HUGGINGFACEHUB_API_TOKEN not set in secrets. Add it in Space settings.")
    st.stop()

try:
    # Function to process PDF
    def process_pdf(uploaded_file):
        try:
            logger.info("Starting PDF processing")
            pdf_reader = PdfReader(BytesIO(uploaded_file.getvalue()))
            text = ""
            for page in pdf_reader.pages:
                extracted = page.extract_text()
                if extracted:
                    text += extracted + "\n"
            if not text:
                raise ValueError("No text extracted from PDF.")

            # Chunk text (increased overlap for better context)
            text_splitter = CharacterTextSplitter(
                separator="\n", chunk_size=800, chunk_overlap=200, length_function=len
            )
            chunks = text_splitter.split_text(text)

            # Embeddings (light model)
            embeddings = HuggingFaceEmbeddings(
                model_name="sentence-transformers/all-MiniLM-L6-v2",
                model_kwargs={"device": "cpu"},
            )

            # Vector store
            vector_store = FAISS.from_texts(chunks, embedding=embeddings)
            logger.info("PDF processed successfully")
            return vector_store
        except Exception as e:
            logger.error(f"PDF processing error: {str(e)}")
            st.error(f"Error processing PDF: {str(e)}")
            return None

    # Function to answer questions
    def answer_question(vector_store, query):
        try:
            logger.info(f"Answering query: {query}")

            # Lighter LLM via pipeline for faster CPU inference
            qa_pipeline = pipeline("text2text-generation", model="google/flan-t5-base")

            # Retrieve top chunks
            docs = vector_store.similarity_search(query, k=3)
            context = "\n".join([doc.page_content for doc in docs])

            # Prompt
            prompt = f"Use this context to answer concisely: {context}\nQuestion: {query}\nAnswer:"
            response = qa_pipeline(prompt, max_length=256, num_return_sequences=1)[0]["generated_text"]
            logger.info("Answer generated")
            return response.strip()
        except Exception as e:
            logger.error(f"Answer generation error: {str(e)}")
            st.error(f"Error answering: {str(e)}")
            return "Unable to generate answer."

    # Streamlit UI with chat history
    st.title("Smart PDF Q&A")
    st.write("Upload a PDF and ask questions! Chat history is preserved.")

    # Initialize session state
    if "messages" not in st.session_state:
        st.session_state.messages = []
    if "vector_store" not in st.session_state:
        st.session_state.vector_store = None

    # PDF upload and process
    uploaded_file = st.file_uploader("Upload PDF", type="pdf")
    if uploaded_file:
        if st.button("Process PDF"):
            with st.spinner("Processing..."):
                vector_store = process_pdf(uploaded_file)
                if vector_store:
                    st.session_state.vector_store = vector_store
                    st.success("PDF ready! Ask away.")
                    st.session_state.messages = []  # Reset chat on new PDF

    # Display chat history
    for message in st.session_state.messages:
        with st.chat_message(message["role"]):
            st.markdown(message["content"])

    # Question input
    if st.session_state.vector_store:
        if prompt := st.chat_input("Ask a question:"):
            # Add user message
            st.session_state.messages.append({"role": "user", "content": prompt})
            with st.chat_message("user"):
                st.markdown(prompt)

            # Generate answer
            with st.chat_message("assistant"):
                with st.spinner("Thinking..."):
                    answer = answer_question(st.session_state.vector_store, prompt)
                    st.markdown(answer)
            st.session_state.messages.append({"role": "assistant", "content": answer})

except Exception as e:
    logger.error(f"App initialization failed: {str(e)}")
    st.error(f"Initialization error: {str(e)}. Check logs or try factory reset.")