import gradio as gr
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
import faiss
import numpy as np
import requests
import os
# Initialize embedder model
embedder = SentenceTransformer("all-MiniLM-L6-v2")
# Groq API config
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "your-groq-api-key")
GROQ_API_URL = "https://api.groq.com/openai/v1/chat/completions"
# Function: Extract text from PDF
def extract_text_from_pdf(pdf_file):
    reader = PdfReader(pdf_file)
    text = ""
    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:
            text += page_text + "\n"
    return text
# Function: Split text into chunks
def split_text_into_chunks(text, chunk_size=500, chunk_overlap=100):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    return splitter.split_text(text)
# Function: Generate embeddings
def generate_embeddings(chunks):
    return embedder.encode(chunks, show_progress_bar=False)
# Function: Build FAISS index
def build_faiss_index(embeddings):
    embeddings = np.asarray(embeddings, dtype="float32")  # FAISS requires float32 vectors
    dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(embeddings)
    return index
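# Alternative (a sketch, not used above): for cosine similarity instead of L2
# distance, normalize in place with faiss.normalize_L2(embeddings) and build
# faiss.IndexFlatIP(dim); inner product on unit vectors equals cosine similarity.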
# Function: Retrieve top-k chunks from index
def retrieve_chunks(query, index, chunks, top_k=3):
    query_vector = embedder.encode([query])
    D, I = index.search(np.asarray(query_vector, dtype="float32"), top_k)
    # FAISS pads results with -1 when the index holds fewer than top_k vectors
    return [chunks[i] for i in I[0] if i != -1]
# Function: Query Groq with context
def query_groq_with_context(context, question):
    headers = {
        "Authorization": f"Bearer {GROQ_API_KEY}",
        "Content-Type": "application/json"
    }
    data = {
        "model": "llama3-8b-8192",
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": f"Context:\n{context}\n\nQuestion:\n{question}"}
        ]
    }
    response = requests.post(GROQ_API_URL, headers=headers, json=data, timeout=60)
    response.raise_for_status()  # surface API errors instead of a KeyError below
    result = response.json()
    return result['choices'][0]['message']['content']
# Global state to store chunks and index
state = {"chunks": None, "index": None}
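# Note: this module-level dict is shared by every visitor of a deployed app;
# per-session storage (e.g. gr.State) would be the safer choice for multi-user use.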
# Gradio Functions
def process_pdf(pdf_file):
    if pdf_file is None:
        return "❗ Please upload a PDF first."
    text = extract_text_from_pdf(pdf_file)
    chunks = split_text_into_chunks(text)
    embeddings = generate_embeddings(chunks)
    index = build_faiss_index(np.array(embeddings))
    state["chunks"] = chunks
    state["index"] = index
    return "✅ PDF processed. You can now ask questions."
def answer_question(question):
    if state["chunks"] is None or state["index"] is None:
        return "❗ Please upload and process a PDF first."
    relevant_chunks = retrieve_chunks(question, state["index"], state["chunks"])
    context = "\n\n".join(relevant_chunks)
    return query_groq_with_context(context, question)
# Gradio Interface
with gr.Blocks() as demo:
    gr.Markdown("# 📄 RAG PDF Chatbot using Groq LLaMA 3")
    with gr.Row():
        pdf_input = gr.File(label="Upload PDF")
        process_button = gr.Button("📥 Process PDF")
    status = gr.Textbox(label="Status")
    process_button.click(fn=process_pdf, inputs=pdf_input, outputs=status)
    question = gr.Textbox(label="Ask a question about the PDF")
    ask_button = gr.Button("🔍 Ask")
    answer = gr.Textbox(label="Answer", lines=5)
    ask_button.click(fn=answer_question, inputs=question, outputs=answer)

demo.launch()
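# To run locally (assuming a valid key): GROQ_API_KEY=<your key> python app.py,
# then open the local URL that Gradio prints to the console.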