TANVEERMAKHDOOM committed on
Commit
c1ec99b
·
verified ·
1 Parent(s): 32fba68

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +106 -0
app.py CHANGED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from PyPDF2 import PdfReader
3
+ from sentence_transformers import SentenceTransformer
4
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
5
+ import faiss
6
+ import numpy as np
7
+ import requests
8
+ import os
9
+
10
# Initialize embedder model
# Shared sentence-transformers model used to embed both PDF chunks and user
# queries (see generate_embeddings / retrieve_chunks), so both live in the
# same vector space.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Groq API config
# NOTE(review): the "your-groq-api-key" fallback is a placeholder that will
# fail authentication at request time — confirm GROQ_API_KEY is set in the
# deployment environment rather than relying on the default.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "your-groq-api-key")
GROQ_API_URL = "https://api.groq.com/openai/v1/chat/completions"
16
+
17
# Function: Extract text from PDF
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page in *pdf_file*.

    Each page's extracted text is followed by a newline; pages that yield
    no extractable text (e.g. image-only pages) are skipped.
    """
    reader = PdfReader(pdf_file)
    page_texts = (page.extract_text() for page in reader.pages)
    return "".join(content + "\n" for content in page_texts if content)
26
+
27
# Function: Split text into chunks
def split_text_into_chunks(text, chunk_size=500, chunk_overlap=100):
    """Split *text* into overlapping chunks for embedding.

    Uses LangChain's recursive character splitter with the given chunk
    size and overlap (in characters).
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_text(text)
34
+
35
# Function: Generate embeddings
def generate_embeddings(chunks):
    """Encode each text chunk into a dense vector via the module-level embedder."""
    vectors = embedder.encode(chunks, show_progress_bar=False)
    return vectors
38
+
39
# Function: Build FAISS index
def build_faiss_index(embeddings):
    """Build an exact (brute-force) L2 FAISS index over *embeddings*.

    Parameters
    ----------
    embeddings : array-like of shape (n_chunks, dim)
        One embedding vector per text chunk.

    Returns
    -------
    faiss.IndexFlatL2
        Index containing all supplied vectors.
    """
    # FAISS expects a C-contiguous float32 array; coerce defensively so the
    # function also accepts plain lists or float64 arrays without crashing.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return index
45
+
46
# Function: Retrieve top-k chunks from index
def retrieve_chunks(query, index, chunks, top_k=3):
    """Return up to *top_k* chunks most similar to *query* by L2 distance.

    Parameters
    ----------
    query : str
        The user's question, embedded with the same model as the chunks.
    index : faiss.Index
        Index built by build_faiss_index over *chunks*' embeddings.
    chunks : list[str]
        Chunk texts, positionally aligned with the indexed vectors.
    top_k : int, default 3
        Maximum number of chunks to return.
    """
    query_vector = np.asarray(embedder.encode([query]), dtype=np.float32)
    _, indices = index.search(query_vector, top_k)
    # FAISS pads the result with -1 when the index holds fewer than top_k
    # vectors; without this filter, chunks[-1] would silently return the
    # last chunk instead of being dropped.
    return [chunks[i] for i in indices[0] if i >= 0]
51
+
52
# Function: Query Groq with context
def query_groq_with_context(context, question):
    """Ask the Groq chat-completions API *question* grounded in *context*.

    Parameters
    ----------
    context : str
        Retrieved chunk text injected into the user message.
    question : str
        The user's question.

    Returns
    -------
    str
        The assistant's reply text.

    Raises
    ------
    requests.HTTPError
        If the API returns a non-2xx status.
    RuntimeError
        If the response body does not have the expected shape.
    """
    headers = {
        "Authorization": f"Bearer {GROQ_API_KEY}",
        "Content-Type": "application/json"
    }
    data = {
        "model": "llama3-8b-8192",
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": f"Context:\n{context}\n\nQuestion:\n{question}"}
        ]
    }
    # Bound the request so a stalled connection cannot hang the UI forever,
    # and surface HTTP errors explicitly instead of failing later with an
    # opaque KeyError on the missing 'choices' field of an error payload.
    response = requests.post(GROQ_API_URL, headers=headers, json=data, timeout=60)
    response.raise_for_status()
    result = response.json()
    try:
        return result['choices'][0]['message']['content']
    except (KeyError, IndexError) as err:
        raise RuntimeError(f"Unexpected Groq API response: {result}") from err
68
+
69
# Global state to store chunks and index
# Written by process_pdf() and read by answer_question(); both entries stay
# None until a PDF has been successfully processed.
state = {"chunks": None, "index": None}
71
+
72
# Gradio Functions
def process_pdf(pdf_file):
    """Extract, chunk, embed, and index the uploaded PDF, updating global state.

    Returns a human-readable status string shown in the UI's Status box.
    """
    # Gradio passes None when the user clicks Process without uploading.
    if pdf_file is None:
        return "❗Please upload a PDF first."
    text = extract_text_from_pdf(pdf_file)
    chunks = split_text_into_chunks(text)
    if not chunks:
        # A scanned/image-only PDF yields no text; without this guard the
        # embeddings array would be empty and FAISS indexing would fail.
        return "❗No extractable text found in this PDF."
    embeddings = generate_embeddings(chunks)
    index = build_faiss_index(np.array(embeddings))
    state["chunks"] = chunks
    state["index"] = index
    return "✅ PDF processed. You can now ask questions."
81
+
82
def answer_question(question):
    """Answer *question* from the most relevant chunks of the processed PDF."""
    chunks = state["chunks"]
    index = state["index"]
    if not chunks or not index:
        return "❗Please upload and process a PDF first."
    context = "\n\n".join(retrieve_chunks(question, index, chunks))
    return query_groq_with_context(context, question)
88
+
89
# Gradio Interface
with gr.Blocks() as demo:
    gr.Markdown("# 📄 RAG PDF Chatbot using Groq LLaMA 3")

    # Components are created in the same order as before, so the rendered
    # layout is unchanged; event wiring is grouped together at the end.
    with gr.Row():
        pdf_input = gr.File(label="Upload PDF")
        process_button = gr.Button("📥 Process PDF")
        status = gr.Textbox(label="Status")

    question = gr.Textbox(label="Ask a question about the PDF")
    ask_button = gr.Button("🔍 Ask")
    answer = gr.Textbox(label="Answer", lines=5)

    process_button.click(fn=process_pdf, inputs=pdf_input, outputs=status)
    ask_button.click(fn=answer_question, inputs=question, outputs=answer)

demo.launch()