|
import os |
|
import gdown |
|
import streamlit as st |
|
from PyPDF2 import PdfReader |
|
from langchain_community.embeddings import HuggingFaceEmbeddings |
|
from langchain_community.vectorstores import FAISS |
|
from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
from groq import Groq |
|
|
|
|
|
# Groq API client shared by query_groq(). The key is read from the
# GROQ_API_KEY environment variable.
_groq_api_key = os.environ.get("GROQ_API_KEY")
if not _groq_api_key:
    # Show an actionable message in the app instead of a raw KeyError
    # traceback, then halt this script run.
    st.error("❌ GROQ_API_KEY environment variable is not set.")
    st.stop()
client = Groq(api_key=_groq_api_key)
|
|
|
|
|
def download_pdf(url, output_path="/tmp/drive_doc.pdf"):
    """Download a Google Drive file to local disk using gdown.

    Args:
        url: Google Drive link. It is passed to gdown with ``fuzzy=True`` so
            the file id can be extracted from a sharing link.
        output_path: Destination file path. Defaults to
            ``"/tmp/drive_doc.pdf"``.

    Returns:
        ``output_path`` when the download succeeded, otherwise ``None``.
        Failures are reported through ``st.error`` rather than raised.
    """
    try:
        downloaded = gdown.download(url=url, output=output_path, quiet=True, fuzzy=True)
    except Exception as e:
        # UI boundary: surface any gdown/network error to the user.
        st.error(f"❌ Download failed: {e}")
        return None

    # NOTE(review): some gdown releases appear to report a failed fetch (for
    # example a link that is not publicly shared) by returning None instead of
    # raising -- confirm for the installed version. Without this check the
    # caller would receive a path to a missing or stale file.
    if not downloaded:
        st.error("❌ Download failed: no file was retrieved from the given link.")
        return None
    return output_path
|
|
|
|
|
def extract_text(pdf_path):
    """Return the text of every page of a PDF as a single string.

    Args:
        pdf_path: Path to the PDF file; handed directly to ``PdfReader``.

    Returns:
        The text of all pages that produced any text, joined with newlines.
        Pages whose ``extract_text()`` result is empty or ``None`` are
        skipped, so the result is ``""`` when no page yields text.
    """
    reader = PdfReader(pdf_path)
    page_texts = (page.extract_text() for page in reader.pages)
    # Join with a newline so the last word of one page is not fused with the
    # first word of the next; str.join also avoids repeated concatenation.
    return "\n".join(content for content in page_texts if content)
|
|
|
|
|
def chunk_text(text, chunk_size=500, chunk_overlap=50):
    """Split text into overlapping chunks for embedding.

    Args:
        text: The text to split.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``. Defaults to 500.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``. Defaults to 50.

    Returns:
        Whatever ``splitter.split_text(text)`` returns (the chunks of *text*).
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_text(text)
|
|
|
|
|
def build_vector_db(chunks, model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Embed text chunks and index them in a FAISS vector store.

    Args:
        chunks: Text chunks to index.
        model_name: Model name passed to ``HuggingFaceEmbeddings``. Defaults
            to ``"sentence-transformers/all-MiniLM-L6-v2"``.

    Returns:
        The vector store built by ``FAISS.from_texts``.

    Raises:
        ValueError: If *chunks* is empty.
    """
    # Fail early with a clear message, before loading the embedding model.
    # NOTE(review): an empty list otherwise seems to fail inside the vector
    # store with an error unrelated to the real cause -- confirm for the
    # installed langchain version.
    if not chunks:
        raise ValueError(
            "No text chunks to index; the document may contain no extractable text."
        )
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    return FAISS.from_texts(chunks, embedding=embeddings)
|
|
|
|
|
def query_groq(query, vector_db, k=3, model="llama3-8b-8192"):
    """Answer a question with Groq, using retrieved chunks as context.

    The ``k`` chunks most similar to *query* are fetched from *vector_db*,
    joined with newlines, and sent as the system message; *query* itself is
    sent as the user message.

    Args:
        query: The user's question.
        vector_db: Object exposing ``similarity_search(query, k=...)`` whose
            results have a ``page_content`` attribute.
        k: Number of chunks to retrieve. Defaults to 3.
        model: Groq chat model id. Defaults to ``"llama3-8b-8192"``.
            NOTE(review): this model id may have been deprecated by Groq --
            confirm against the current model list.

    Returns:
        The message content of the first completion choice.
    """
    docs = vector_db.similarity_search(query, k=k)
    context = "\n".join(doc.page_content for doc in docs)
    # Uses the module-level Groq ``client``.
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": f"Use the following context:\n{context}"},
            {"role": "user", "content": query},
        ],
    )
    return response.choices[0].message.content
|
|
|
|
|
|
|
st.title("📄 RAG QA from Google Drive PDF")

link = "https://drive.google.com/file/d/1SGXNLO841VyHnGiX81oo6x2RHIrTmP5S/view?usp=sharing"

# Streamlit re-executes this script on every widget interaction. Keep the
# built index in session state so the PDF is not re-downloaded and re-embedded
# each time a query is entered.
vector_db = st.session_state.get("vector_db")

if vector_db is None:
    st.write("📥 Downloading and processing document...")
    pdf_path = download_pdf(link)

    if pdf_path:
        try:
            text = extract_text(pdf_path)
            chunks = chunk_text(text)
            vector_db = build_vector_db(chunks)
        except Exception as e:
            # UI boundary: report any parsing/embedding failure to the user.
            st.error(f"❌ Error processing PDF: {e}")
            vector_db = None
        else:
            # Only successful builds are stored, so a failed attempt is
            # retried on the next rerun.
            st.session_state["vector_db"] = vector_db
            st.success("✅ Document processed successfully.")

query = st.text_input("🔍 Enter your query:")

if query and vector_db is not None:
    try:
        answer = query_groq(query, vector_db)
    except Exception as e:
        # Show API/retrieval failures as a message instead of a traceback.
        st.error(f"❌ Error answering query: {e}")
    else:
        st.subheader("💬 Answer:")
        st.write(answer)
elif query:
    st.warning("⚠️ Document not ready yet.")
|
|