import os

import gdown
import streamlit as st
from PyPDF2 import PdfReader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from groq import Groq

# Initialize Groq client
client = Groq(api_key=os.environ["GROQ_API_KEY"])


# Download and save PDF using gdown
def download_pdf(url):
    output_path = "/tmp/drive_doc.pdf"
    try:
        gdown.download(url=url, output=output_path, quiet=True, fuzzy=True)
        return output_path
    except Exception as e:
        st.error(f"❌ Download failed: {e}")
        return None


# Extract text from PDF
def extract_text(pdf_path):
    reader = PdfReader(pdf_path)
    text = ""
    for page in reader.pages:
        content = page.extract_text()
        if content:
            text += content
    return text


# Split text into chunks
def chunk_text(text):
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    return splitter.split_text(text)


# Create embeddings and store in FAISS
def build_vector_db(chunks):
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return FAISS.from_texts(chunks, embedding=embeddings)


# Query the vector DB and get response from Groq
def query_groq(query, vector_db):
    docs = vector_db.similarity_search(query, k=3)
    context = "\n".join([doc.page_content for doc in docs])
    response = client.chat.completions.create(
        model="llama3-8b-8192",
        messages=[
            {"role": "system", "content": f"Use the following context:\n{context}"},
            {"role": "user", "content": query},
        ],
    )
    return response.choices[0].message.content


# --- Streamlit App ---
st.title("📄 RAG QA from Google Drive PDF")

link = "https://drive.google.com/file/d/1SGXNLO841VyHnGiX81oo6x2RHIrTmP5S/view?usp=sharing"

st.write("📥 Downloading and processing document...")
pdf_path = download_pdf(link)

if pdf_path:
    try:
        text = extract_text(pdf_path)
        chunks = chunk_text(text)
        vector_db = build_vector_db(chunks)
        st.success("✅ Document processed successfully.")
    except Exception as e:
        st.error(f"❌ Error processing PDF: {e}")
        vector_db = None
else:
    vector_db = None

query = st.text_input("🔍 Enter your query:")

if query and vector_db:
    answer = query_groq(query, vector_db)
    st.subheader("💬 Answer:")
    st.write(answer)
elif query:
    st.warning("⚠️ Document not ready yet.")