# HuggingFace Spaces app — RAG PDF chatbot (Groq LLaMA 3 + FAISS + MiniLM).
# (The original paste carried scraped page residue "Spaces: / Sleeping" here.)
# --- Imports ---------------------------------------------------------------
import os

import faiss
import gradio as gr
import numpy as np
import requests
from langchain.text_splitter import RecursiveCharacterTextSplitter
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used for both document chunks and queries.
# Loaded once at module import so every request reuses the same weights.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Groq API configuration. The key is read from the environment; the literal
# fallback is a placeholder that will be rejected by the API, not a secret.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "your-groq-api-key")
GROQ_API_URL = "https://api.groq.com/openai/v1/chat/completions"
def extract_text_from_pdf(pdf_file):
    """Extract the plain text of every page of a PDF.

    Args:
        pdf_file: A file path or binary file-like object accepted by
            ``PyPDF2.PdfReader`` (Gradio's ``gr.File`` value works).

    Returns:
        str: Concatenated page texts separated by newlines. Pages for
        which PyPDF2 cannot extract text (e.g. scanned images) are
        skipped rather than contributing ``None``.
    """
    reader = PdfReader(pdf_file)
    pages = []
    for page in reader.pages:
        page_text = page.extract_text()
        # extract_text() may return None/"" for image-only pages.
        if page_text:
            pages.append(page_text)
    return "\n".join(pages) + ("\n" if pages else "")
def split_text_into_chunks(text, chunk_size=500, chunk_overlap=100):
    """Split *text* into overlapping chunks for retrieval.

    Args:
        text: The full document text.
        chunk_size: Target maximum characters per chunk.
        chunk_overlap: Characters of overlap between consecutive chunks,
            so answers spanning a chunk boundary are still retrievable.

    Returns:
        list[str]: The chunked text.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_text(text)
def generate_embeddings(chunks):
    """Embed each text chunk with the module-level SentenceTransformer.

    Args:
        chunks: Sequence of text strings.

    Returns:
        A 2-D array of shape (len(chunks), embedding_dim).
    """
    return embedder.encode(chunks, show_progress_bar=False)
def build_faiss_index(embeddings):
    """Build an exact (brute-force) L2 FAISS index over *embeddings*.

    Args:
        embeddings: 2-D numpy array of shape (n_chunks, dim). FAISS
            requires float32; callers should convert before adding.

    Returns:
        faiss.IndexFlatL2: Index containing all rows of *embeddings*.
    """
    dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(dim)
    # FAISS expects contiguous float32 data; enforce it defensively so a
    # float64 array from SentenceTransformer doesn't raise or corrupt.
    index.add(np.ascontiguousarray(embeddings, dtype=np.float32))
    return index
def retrieve_chunks(query, index, chunks, top_k=3):
    """Return the *top_k* chunks most similar to *query*.

    Args:
        query: Natural-language question.
        index: FAISS index built over the embeddings of *chunks*.
        chunks: The text chunks, in the same order they were indexed.
        top_k: Maximum number of chunks to return.

    Returns:
        list[str]: Best-matching chunks, nearest first.
    """
    query_vector = embedder.encode([query])
    # Clamp k: asking FAISS for more neighbours than indexed vectors makes
    # it pad the result with index -1, and chunks[-1] would silently return
    # the *last* chunk instead of signalling "no result".
    k = min(top_k, len(chunks))
    _distances, neighbour_ids = index.search(
        np.ascontiguousarray(query_vector, dtype=np.float32), k
    )
    return [chunks[i] for i in neighbour_ids[0] if i >= 0]
def query_groq_with_context(context, question):
    """Ask the Groq chat-completions API a question grounded in *context*.

    Args:
        context: Retrieved document text to ground the answer in.
        question: The user's question.

    Returns:
        str: The assistant message content from the first choice.

    Raises:
        requests.HTTPError: If the API responds with a 4xx/5xx status
            (e.g. bad API key), instead of a confusing KeyError on
            ``result['choices']``.
        requests.Timeout: If the API does not respond within 60 seconds.
    """
    headers = {
        "Authorization": f"Bearer {GROQ_API_KEY}",
        "Content-Type": "application/json",
    }
    data = {
        "model": "llama3-8b-8192",
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": f"Context:\n{context}\n\nQuestion:\n{question}"}
        ]
    }
    # A timeout is mandatory: requests.post without one can block forever
    # and hang the Gradio worker if the API stalls.
    response = requests.post(GROQ_API_URL, headers=headers, json=data, timeout=60)
    response.raise_for_status()
    result = response.json()
    return result['choices'][0]['message']['content']
# Global in-process state shared between the "process PDF" and "ask" handlers.
# NOTE(review): module-level state is shared by all concurrent users of the
# Space — fine for a single-user demo, not for multi-tenant use.
state = {"chunks": None, "index": None}
def process_pdf(pdf_file):
    """Gradio handler: ingest a PDF and build the retrieval index.

    Extracts text, chunks it, embeds the chunks, builds a FAISS index,
    and stores both chunks and index in the module-level ``state``.

    Args:
        pdf_file: Value from the ``gr.File`` component.

    Returns:
        str: Status message for the UI.
    """
    if pdf_file is None:
        return "βPlease upload a PDF first."
    text = extract_text_from_pdf(pdf_file)
    chunks = split_text_into_chunks(text)
    if not chunks:
        return "βNo extractable text found in this PDF."
    embeddings = generate_embeddings(chunks)
    # float32 is what FAISS stores internally; convert once here.
    index = build_faiss_index(np.asarray(embeddings, dtype=np.float32))
    state["chunks"] = chunks
    state["index"] = index
    return "β PDF processed. You can now ask questions."
def answer_question(question):
    """Gradio handler: answer *question* using the processed PDF.

    Args:
        question: The user's question text.

    Returns:
        str: The LLM's answer, or a prompt to upload a PDF first.
    """
    # Use explicit `is None` checks: `not index` would invoke truthiness on
    # a FAISS index object, which is not a reliable emptiness test.
    if state["chunks"] is None or state["index"] is None:
        return "βPlease upload and process a PDF first."
    relevant_chunks = retrieve_chunks(question, state["index"], state["chunks"])
    context = "\n\n".join(relevant_chunks)
    return query_groq_with_context(context, question)
# --- Gradio interface ------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("# π RAG PDF Chatbot using Groq LLaMA 3")

    with gr.Row():
        pdf_input = gr.File(label="Upload PDF")
        process_button = gr.Button("π₯ Process PDF")

    status = gr.Textbox(label="Status")
    process_button.click(fn=process_pdf, inputs=pdf_input, outputs=status)

    question = gr.Textbox(label="Ask a question about the PDF")
    ask_button = gr.Button("π Ask")
    answer = gr.Textbox(label="Answer", lines=5)
    ask_button.click(fn=answer_question, inputs=question, outputs=answer)

demo.launch()