File size: 2,575 Bytes
d58b556
4bf11bb
 
d58b556
4bf11bb
 
 
 
d58b556
4bf11bb
 
d58b556
f9a9ebf
 
 
4bf11bb
f9a9ebf
4bf11bb
 
f9a9ebf
d58b556
 
4bf11bb
f9a9ebf
 
d58b556
f9a9ebf
 
 
 
d58b556
 
4bf11bb
f9a9ebf
 
 
d58b556
f9a9ebf
 
d58b556
f9a9ebf
d58b556
f9a9ebf
 
d58b556
 
f9a9ebf
 
d58b556
 
 
f9a9ebf
d58b556
f9a9ebf
d58b556
f9a9ebf
d58b556
f9a9ebf
d58b556
f9a9ebf
d58b556
f9a9ebf
 
d58b556
f9a9ebf
 
 
 
 
 
 
 
 
 
 
96abe9d
f9a9ebf
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import os
import gdown
import streamlit as st
from PyPDF2 import PdfReader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from groq import Groq

# Initialize Groq client once at module import.
# NOTE(review): raises KeyError at import time if GROQ_API_KEY is not set in
# the environment — presumably an intentional fail-fast; confirm that is the
# desired behavior rather than a friendlier st.error message.
client = Groq(api_key=os.environ['GROQ_API_KEY'])

# Download and save PDF using gdown
def download_pdf(url):
    """Download a Google Drive PDF to a fixed temp path.

    Args:
        url: Google Drive share link; ``fuzzy=True`` lets gdown extract the
            file id from ``/view``-style URLs.

    Returns:
        The local file path on success, or ``None`` if the download failed
        (an error is shown in the Streamlit UI in that case).
    """
    output_path = "/tmp/drive_doc.pdf"
    try:
        # gdown.download returns the output path on success but None on
        # failure (e.g. missing sharing permissions) WITHOUT raising, so the
        # return value must be checked explicitly.
        result = gdown.download(url=url, output=output_path, quiet=True, fuzzy=True)
        if result is None:
            st.error("❌ Download failed: gdown returned no file (check sharing permissions).")
            return None
        return output_path
    except Exception as e:
        st.error(f"❌ Download failed: {e}")
        return None

# Extract text from PDF
def extract_text(pdf_path):
    """Return the concatenated text of every page of the PDF at *pdf_path*.

    Pages for which PyPDF2 cannot extract text (``extract_text()`` returns
    ``None`` or an empty string) contribute nothing, matching the original
    behavior of skipping falsy page content.
    """
    reader = PdfReader(pdf_path)
    # Collect per-page text and join once: avoids the quadratic cost of
    # repeated `text += content` string concatenation on large documents.
    parts = [page.extract_text() or "" for page in reader.pages]
    return "".join(parts)

# Split text into chunks
def chunk_text(text):
    """Break *text* into 500-character chunks with 50-character overlap."""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50,
    )
    chunks = text_splitter.split_text(text)
    return chunks

# Create embeddings and store in FAISS
def build_vector_db(chunks):
    """Embed the given text chunks and index them in an in-memory FAISS store."""
    embedding_model = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    vector_store = FAISS.from_texts(chunks, embedding=embedding_model)
    return vector_store

# Query the vector DB and get response from Groq
def query_groq(query, vector_db):
    """Answer *query* using the 3 most similar chunks from *vector_db* as context.

    Retrieves the top-3 matching chunks, injects them into the system prompt,
    and returns the text of the Groq chat completion.
    """
    matches = vector_db.similarity_search(query, k=3)
    context = "\n".join(doc.page_content for doc in matches)
    messages = [
        {"role": "system", "content": f"Use the following context:\n{context}"},
        {"role": "user", "content": query},
    ]
    completion = client.chat.completions.create(
        model="llama3-8b-8192",
        messages=messages,
    )
    return completion.choices[0].message.content

# --- Streamlit App ---

st.title("📄 RAG QA from Google Drive PDF")

link = "https://drive.google.com/file/d/1SGXNLO841VyHnGiX81oo6x2RHIrTmP5S/view?usp=sharing"

# Streamlit re-executes this whole script on every widget interaction, so
# without caching the PDF would be re-downloaded and re-embedded on every
# single query. Process the document once per session and keep the vector
# DB in st.session_state.
if "vector_db" not in st.session_state:
    st.write("📥 Downloading and processing document...")
    pdf_path = download_pdf(link)
    if pdf_path:
        try:
            text = extract_text(pdf_path)
            chunks = chunk_text(text)
            st.session_state["vector_db"] = build_vector_db(chunks)
            st.success("✅ Document processed successfully.")
        except Exception as e:
            # Surface the failure in the UI; a None sentinel marks "not ready".
            st.error(f"❌ Error processing PDF: {e}")
            st.session_state["vector_db"] = None
    else:
        st.session_state["vector_db"] = None

vector_db = st.session_state["vector_db"]

query = st.text_input("🔍 Enter your query:")
if query and vector_db:
    answer = query_groq(query, vector_db)
    st.subheader("💬 Answer:")
    st.write(answer)
elif query:
    st.warning("⚠️ Document not ready yet.")