Update app.py
app.py
CHANGED
@@ -0,0 +1,106 @@
import gradio as gr
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
import faiss
import numpy as np
import requests
import os

# Initialize embedder model
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Groq API config
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "your-groq-api-key")
GROQ_API_URL = "https://api.groq.com/openai/v1/chat/completions"

# Function: Extract text from PDF
def extract_text_from_pdf(pdf_file):
    reader = PdfReader(pdf_file)
    text = ""
    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:
            text += page_text + "\n"
    return text

# Function: Split text into chunks
def split_text_into_chunks(text, chunk_size=500, chunk_overlap=100):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    return splitter.split_text(text)

# Function: Generate embeddings
def generate_embeddings(chunks):
    return embedder.encode(chunks, show_progress_bar=False)

# Function: Build FAISS index
def build_faiss_index(embeddings):
    dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(embeddings)
    return index

# Function: Retrieve top-k chunks from index
def retrieve_chunks(query, index, chunks, top_k=3):
    query_vector = embedder.encode([query])
    D, I = index.search(np.array(query_vector), top_k)
    return [chunks[i] for i in I[0]]

# Function: Query Groq with context
def query_groq_with_context(context, question):
    headers = {
        "Authorization": f"Bearer {GROQ_API_KEY}",
        "Content-Type": "application/json"
    }
    data = {
        "model": "llama3-8b-8192",
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": f"Context:\n{context}\n\nQuestion:\n{question}"}
        ]
    }
    response = requests.post(GROQ_API_URL, headers=headers, json=data)
    result = response.json()
    return result['choices'][0]['message']['content']

# Global state to store chunks and index
state = {"chunks": None, "index": None}

# Gradio Functions
def process_pdf(pdf_file):
    text = extract_text_from_pdf(pdf_file)
    chunks = split_text_into_chunks(text)
    embeddings = generate_embeddings(chunks)
    index = build_faiss_index(np.array(embeddings))
    state["chunks"] = chunks
    state["index"] = index
    return "✅ PDF processed. You can now ask questions."

def answer_question(question):
    if not state["chunks"] or not state["index"]:
        return "❗Please upload and process a PDF first."
    relevant_chunks = retrieve_chunks(question, state["index"], state["chunks"])
    context = "\n\n".join(relevant_chunks)
    return query_groq_with_context(context, question)

# Gradio Interface
with gr.Blocks() as demo:
    gr.Markdown("# 📄 RAG PDF Chatbot using Groq LLaMA 3")

    with gr.Row():
        pdf_input = gr.File(label="Upload PDF")
        process_button = gr.Button("📥 Process PDF")
        status = gr.Textbox(label="Status")

    process_button.click(fn=process_pdf, inputs=pdf_input, outputs=status)

    question = gr.Textbox(label="Ask a question about the PDF")
    ask_button = gr.Button("🔍 Ask")
    answer = gr.Textbox(label="Answer", lines=5)

    ask_button.click(fn=answer_question, inputs=question, outputs=answer)

demo.launch()
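
Note: this commit only changes app.py. For the Space to build, the imports above imply a companion requirements.txt roughly like the sketch below; the exact package set and any version pins are assumptions, not part of this change.

# requirements.txt — hypothetical companion file inferred from the imports in app.py
gradio
PyPDF2
sentence-transformers
langchain
faiss-cpu
numpy
requests

The GROQ_API_KEY placeholder default is not usable as-is: on Hugging Face Spaces it would be supplied as a repository secret of the same name, and for a local run something like `export GROQ_API_KEY=...` followed by `python app.py` starts the Blocks UI.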