File size: 2,575 Bytes
d58b556 4bf11bb d58b556 4bf11bb d58b556 4bf11bb d58b556 f9a9ebf 4bf11bb f9a9ebf 4bf11bb f9a9ebf d58b556 4bf11bb f9a9ebf d58b556 f9a9ebf d58b556 4bf11bb f9a9ebf d58b556 f9a9ebf d58b556 f9a9ebf d58b556 f9a9ebf d58b556 f9a9ebf d58b556 f9a9ebf d58b556 f9a9ebf d58b556 f9a9ebf d58b556 f9a9ebf d58b556 f9a9ebf d58b556 f9a9ebf d58b556 f9a9ebf 96abe9d f9a9ebf |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 |
import os
import gdown
import streamlit as st
from PyPDF2 import PdfReader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from groq import Groq
# Initialize Groq client.
# The API key is read from the GROQ_API_KEY environment variable; the lookup
# raises KeyError at startup if the variable is not set.
_groq_api_key = os.environ['GROQ_API_KEY']
client = Groq(api_key=_groq_api_key)
# Download and save PDF using gdown
def download_pdf(url):
    """Download the PDF behind ``url`` to a fixed temporary path.

    Args:
        url: Share link of the document; it is handed to ``gdown.download``
            with ``fuzzy=True``.

    Returns:
        The local path of the downloaded file, or ``None`` when the download
        failed. On failure an error message is shown in the Streamlit UI.
    """
    output_path = "/tmp/drive_doc.pdf"
    try:
        downloaded = gdown.download(url=url, output=output_path, quiet=True, fuzzy=True)
    except Exception as e:
        st.error(f"❌ Download failed: {e}")
        return None

    # NOTE(review): some gdown versions report failure by returning None
    # instead of raising - confirm against the installed version. Without
    # this check a failed download (or a stale file left at output_path by an
    # earlier run) would be reported as a success.
    if not downloaded or not os.path.isfile(output_path):
        st.error("❌ Download failed: no file was retrieved from the given link.")
        return None
    return output_path
# Extract text from PDF
def extract_text(pdf_path):
    """Return the text of all pages of the PDF at ``pdf_path``.

    Pages whose ``extract_text()`` result is empty or ``None`` are skipped.
    The remaining page texts are joined with a newline so that the last word
    of one page is not fused with the first word of the next.

    Args:
        pdf_path: Path of the PDF file, passed to ``PdfReader``.

    Returns:
        A single string with the extracted text (empty if nothing could be
        extracted).
    """
    reader = PdfReader(pdf_path)
    page_texts = (page.extract_text() for page in reader.pages)
    # str.join builds the result in one pass instead of repeated `+=`.
    return "\n".join(content for content in page_texts if content)
# Split text into chunks
def chunk_text(text, *, chunk_size=500, chunk_overlap=50):
    """Split ``text`` into overlapping chunks.

    Args:
        text: The text to split.
        chunk_size: Value passed to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``. Defaults to 500.
        chunk_overlap: Value passed to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``. Defaults to 50.

    Returns:
        The result of ``splitter.split_text(text)``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_text(text)
# Create embeddings and store in FAISS
def build_vector_db(chunks, *, model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Embed ``chunks`` and index them in a FAISS vector store.

    Args:
        chunks: Text chunks to index.
        model_name: Model name passed to ``HuggingFaceEmbeddings``.

    Returns:
        The store returned by ``FAISS.from_texts``.

    Raises:
        ValueError: If ``chunks`` is empty, i.e. there is nothing to index.
    """
    chunks = list(chunks)
    if not chunks:
        # Fail early with a readable message. NOTE(review): an empty list is
        # expected to surface as an opaque error from the FAISS wrapper
        # otherwise - confirm for the installed langchain version.
        raise ValueError("No text chunks to index; the document appears to contain no extractable text.")
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    return FAISS.from_texts(chunks, embedding=embeddings)
# Query the vector DB and get response from Groq
def query_groq(query, vector_db, *, k=3, model="llama3-8b-8192"):
    """Answer ``query`` using context retrieved from ``vector_db``.

    The ``k`` most similar documents are fetched with
    ``vector_db.similarity_search``. Their ``page_content`` is joined with
    newlines and sent as the system message; ``query`` is sent as the user
    message.

    Args:
        query: The user's question.
        vector_db: Object exposing ``similarity_search(query, k=...)``.
        k: Number of documents to retrieve. Defaults to 3.
        model: Model identifier passed to the chat completion call.
            NOTE(review): verify this model is still offered by the
            provider; hosted model ids get retired over time.

    Returns:
        The ``content`` of the first choice's message in the response.
    """
    docs = vector_db.similarity_search(query, k=k)
    context = "\n".join(doc.page_content for doc in docs)
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": f"Use the following context:\n{context}"},
            {"role": "user", "content": query},
        ],
    )
    return response.choices[0].message.content
# --- Streamlit App ---
st.title("📄 RAG QA from Google Drive PDF")
link = "https://drive.google.com/file/d/1SGXNLO841VyHnGiX81oo6x2RHIrTmP5S/view?usp=sharing"


class _DownloadFailed(Exception):
    """Signals that download_pdf() returned no path (it already showed the error)."""


@st.cache_resource(show_spinner=False)
def _load_vector_db(url):
    """Download, parse, chunk and index the PDF at ``url``.

    Streamlit re-executes this script on every widget interaction. Caching
    the result means the document is not re-downloaded and re-embedded for
    each query. Failures are raised rather than returned so that a failed
    attempt is not stored in the cache and is retried on the next run.
    """
    pdf_path = download_pdf(url)
    if not pdf_path:
        raise _DownloadFailed(url)
    text = extract_text(pdf_path)
    chunks = chunk_text(text)
    return build_vector_db(chunks)


st.write("📥 Downloading and processing document...")
try:
    vector_db = _load_vector_db(link)
except _DownloadFailed:
    # download_pdf() has already reported the problem in the UI.
    vector_db = None
except Exception as e:
    st.error(f"❌ Error processing PDF: {e}")
    vector_db = None
else:
    st.success("✅ Document processed successfully.")

query = st.text_input("🔍 Enter your query:")
# Compare against None explicitly instead of relying on the store's truthiness.
if query and vector_db is not None:
    try:
        answer = query_groq(query, vector_db)
    except Exception as e:
        # Show API/search failures in the UI instead of crashing the app.
        st.error(f"❌ Query failed: {e}")
    else:
        st.subheader("💬 Answer:")
        st.write(answer)
elif query:
    st.warning("⚠️ Document not ready yet.")
|