# ------------- app.py -------------
"""Streamlit app for chatting with, and summarizing, an uploaded PDF.

Workflow: the user uploads a PDF in the sidebar, the text (with an OCR
fallback for pages without a text layer) and embedded images are extracted,
the text is chunked and indexed in a FAISS vector store, and the two tabs
offer retrieval-augmented Q&A and a one-shot summary.
"""
import logging
import os
import re
import time
from io import BytesIO
from pathlib import Path

import numpy as np
import pdfplumber
import pytesseract
import streamlit as st
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from PIL import Image
from sentence_transformers import SentenceTransformer
from transformers import pipeline

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Tunables shared by the helpers below.
RENDER_RESOLUTION = 200   # DPI used for OCR rendering and image crops
CHUNK_SIZE = 600          # characters per retrieval chunk
CHUNK_OVERLAP = 80        # characters shared between neighbouring chunks
SUMMARY_CHAR_LIMIT = 3000  # prefix of the document handed to the summarizer
MIN_SUMMARY_CHARS = 50    # below this the document is not summarized
TOP_K = 4                 # chunks retrieved per question

###############################################################################
# Page layout
###############################################################################
st.set_page_config(page_title="PDF Chat & Summarize", layout="wide")
# Hook for custom CSS; the injected markup is currently empty.
st.markdown(""" """, unsafe_allow_html=True)


###############################################################################
# Cached heavy objects
###############################################################################
@st.cache_resource(show_spinner=False)
def load_embed():
    """Load the sentence-embedding model once per server process."""
    return SentenceTransformer("all-MiniLM-L6-v2")


@st.cache_resource(show_spinner=False)
def load_qa():
    """Load the text2text model used to answer questions."""
    return pipeline("text2text-generation", model="google/flan-t5-large", max_length=512)


@st.cache_resource(show_spinner=False)
def load_sum():
    """Load the summarization model."""
    return pipeline("summarization", model="facebook/bart-large-cnn", max_length=250)


embed = load_embed()
qa_pipe = load_qa()
sum_pipe = load_sum()


###############################################################################
# Helpers
###############################################################################
def extract_pdf(uploaded_file):
    """Extract the text and the embedded images of an uploaded PDF.

    Args:
        uploaded_file: object exposing ``getbuffer()`` with the PDF bytes
            (the value returned by ``st.file_uploader``).

    Returns:
        A ``(text, images)`` tuple: ``text`` is the page texts joined by
        newlines and stripped; ``images`` is a list of PIL images, one per
        embedded image that could be cropped.
    """
    page_texts = []
    images = []
    with pdfplumber.open(BytesIO(uploaded_file.getbuffer())) as pdf:
        for page in pdf.pages:
            # Prefer layout-preserving extraction, then plain extraction.
            txt = page.extract_text(layout=True) or ""
            if not txt.strip():
                txt = page.extract_text() or ""
            if not txt.strip():
                # No usable text layer (e.g. a scanned page): fall back to OCR.
                rendered = page.to_image(resolution=RENDER_RESOLUTION).original
                txt = pytesseract.image_to_string(rendered)
            page_texts.append(txt)

            for img in page.images:
                try:
                    # within_bbox expects (x0, top, x1, bottom) measured from
                    # the top of the page; "y0"/"y1" are measured from the
                    # bottom and would select the wrong region.
                    bbox = (img["x0"], img["top"], img["x1"], img["bottom"])
                    crop = page.within_bbox(bbox)
                    images.append(crop.to_image(resolution=RENDER_RESOLUTION).original)
                except Exception:
                    # Best effort: one bad image (e.g. a bounding box that
                    # extends past the page) must not abort the extraction.
                    logger.warning(
                        "Skipping an image on page %s", page.page_number, exc_info=True
                    )
    return "\n".join(page_texts).strip(), images


def build_index(text):
    """Chunk ``text`` and index the chunks in a FAISS vector store.

    Returns:
        The FAISS store, or ``None`` when ``text`` yields no chunks (FAISS
        cannot be built from an empty embedding list).
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP
    )
    chunks = splitter.split_text(text)
    if not chunks:
        return None
    vectors = embed.encode(chunks, show_progress_bar=False, batch_size=64)
    # Pass the encode method rather than the model object: calling the model
    # itself runs its forward pass, which does not accept a plain string.
    return FAISS.from_embeddings(list(zip(chunks, vectors)), embed.encode)


def summarize(text):
    """Return a summary of the beginning of ``text``.

    Only the first ``SUMMARY_CHAR_LIMIT`` characters are summarized; texts
    shorter than ``MIN_SUMMARY_CHARS`` characters get a fixed notice instead.
    """
    if len(text) < MIN_SUMMARY_CHARS:
        return "Document too short to summarize."
    truncated = text[:SUMMARY_CHAR_LIMIT]
    # truncation=True keeps a token-dense prefix within the model's input limit.
    result = sum_pipe(
        truncated, max_length=250, min_length=60, do_sample=False, truncation=True
    )
    return result[0]["summary_text"]


def answer(question, index):
    """Answer ``question`` from the ``TOP_K`` most similar chunks in ``index``.

    Returns a prompt to upload a PDF when ``index`` is ``None``.
    """
    if index is None:
        return "Please upload & process a PDF first."
    # Embed the question here so retrieval does not depend on how the vector
    # store wraps its embedding function.
    query_vector = embed.encode(question).tolist()
    docs = index.similarity_search_by_vector(query_vector, k=TOP_K)
    context = "\n".join(d.page_content for d in docs)
    prompt = f"Answer the question using ONLY the context below.\n\nContext:\n{context}\n\nQuestion: {question}"
    return qa_pipe(prompt, max_length=256, do_sample=False)[0]["generated_text"]


###############################################################################
# Session init
###############################################################################
if "messages" not in st.session_state:
    st.session_state.messages = []
if "index" not in st.session_state:
    st.session_state.index = None
if "raw_text" not in st.session_state:
    st.session_state.raw_text = ""
if "images" not in st.session_state:
    st.session_state.images = []

###############################################################################
# Sidebar
###############################################################################
with st.sidebar:
    st.subheader("📁 Upload PDF")
    uploaded = st.file_uploader("Choose a file", type="pdf", label_visibility="collapsed")
    if uploaded and st.button("Process PDF"):
        with st.spinner("Extracting text & images…"):
            st.session_state.raw_text, st.session_state.images = extract_pdf(uploaded)
            st.session_state.index = build_index(st.session_state.raw_text)
            # A new document invalidates the previous conversation.
            st.session_state.messages = []
        if st.session_state.index is None:
            st.warning("No extractable text was found in this PDF.")
        else:
            st.toast("PDF ready!")

    if st.session_state.images:
        st.subheader("🖼️ Extracted Images")
        for im in st.session_state.images:
            st.image(im, use_column_width=True)

###############################################################################
# Main Tabs
###############################################################################
tab_chat, tab_sum = st.tabs(["💬 Chat", "📄 Summarize"])

with tab_chat:
    if st.session_state.index is None:
        st.info("Upload & process a PDF first using the sidebar.")
    else:
        # history
        # Messages are rendered through st.chat_message instead of raw HTML so
        # user and model text cannot inject markup into the page.
        for role, msg in st.session_state.messages:
            with st.chat_message("user" if role == "user" else "assistant"):
                st.markdown(msg)

        # input
        if question := st.chat_input("Ask anything about the PDF…"):
            st.session_state.messages.append(("user", question))
            with st.chat_message("user"):
                st.markdown(question)
            with st.spinner("Thinking…"):
                resp = answer(question, st.session_state.index)
            st.session_state.messages.append(("assistant", resp))
            with st.chat_message("assistant"):
                st.markdown(resp)

with tab_sum:
    if not st.session_state.raw_text:
        st.info("Upload & process a PDF first.")
    else:
        if st.button("Generate Summary"):
            with st.spinner("Summarizing…"):
                summary = summarize(st.session_state.raw_text)
            st.subheader("Summary")
            st.write(summary)