# "Spaces: Sleeping" — Hugging Face Space status banner captured during
# extraction; not part of the application source.
# app.py
import io
import json
import os

import faiss
import numpy as np
import PyPDF2
import requests
import streamlit as st
from groq import Groq
# Constants
# Publicly shared Google Drive direct-download link for the source PDF.
PDF_URL = "https://drive.google.com/uc?export=download&id=1YWX-RYxgtcKO1QETnz1N3rboZUhRZwcH"
# Dimensionality of the embedding vectors stored in the FAISS index.
VECTOR_DIM = 768
# Maximum number of whitespace-separated words per text chunk.
CHUNK_SIZE = 512
def extract_text_from_pdf(url):
    """Download the PDF at *url* and return the concatenated text of all pages.

    Fixes over the previous version:
    - fails loudly on a bad download (``raise_for_status`` + a request timeout)
      instead of handing an HTML error page to PyPDF2;
    - parses from an in-memory buffer rather than writing ``document.pdf``
      to the working directory;
    - tolerates image-only pages, where ``extract_text()`` returns ``None``.

    Raises:
        requests.HTTPError: if the server responds with a non-2xx status.
    """
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    reader = PyPDF2.PdfReader(io.BytesIO(response.content))
    # extract_text() may return None for pages with no extractable text.
    return "\n".join(page.extract_text() or "" for page in reader.pages)
# Function to split text into chunks
def create_chunks(text, chunk_size):
    """Split *text* into chunks of at most *chunk_size* whitespace-separated words."""
    words = text.split()
    return [
        " ".join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]
# Function to create FAISS vector store
def create_faiss_index(chunks, vector_dim):
    """Build a flat L2 FAISS index holding one vector per chunk.

    NOTE(review): the vectors are random placeholders (see comment below), so
    similarity search over this index is meaningless until real embeddings
    are substituted.

    Returns:
        (index, embeddings): the populated FAISS index and the raw vectors.
    """
    flat_index = faiss.IndexFlatL2(vector_dim)
    if faiss.get_num_gpus() > 0:
        # CUDA device visible: move the flat index onto GPU 0.
        st.write("Using GPU for FAISS indexing.")
        gpu_resources = faiss.StandardGpuResources()
        index = faiss.index_cpu_to_gpu(gpu_resources, 0, flat_index)
    else:
        st.write("Using CPU for FAISS indexing.")
        index = flat_index
    embeddings = np.random.rand(len(chunks), vector_dim).astype('float32')  # Replace with real embeddings
    index.add(embeddings)
    return index, embeddings
# Initialize Groq API client
def get_groq_client():
    """Construct a Groq client from the GROQ_API_KEY environment variable."""
    api_key = os.environ.get("GROQ_API_KEY")
    return Groq(api_key=api_key)
# Query Groq model
def query_model(client, question):
    """Send *question* as a single user turn and return the model's reply text."""
    conversation = [{"role": "user", "content": question}]
    completion = client.chat.completions.create(
        messages=conversation,
        model="llama-3.3-70b-versatile",
    )
    return completion.choices[0].message.content
# Streamlit app
def main():
    """Drive the four-step Streamlit UI: extract, chunk, index, query."""
    st.title("RAG-Based ChatBot")

    # Step 1: download the PDF and pull out its text.
    st.header("Step 1: Extract Text")
    if st.button("Extract Text from PDF"):
        st.session_state["text"] = extract_text_from_pdf(PDF_URL)
        st.success("Text extracted successfully!")

    # Step 2: split the extracted text into fixed-size word chunks.
    st.header("Step 2: Create Chunks")
    if "text" in st.session_state and st.button("Create Chunks"):
        chunks = create_chunks(st.session_state["text"], CHUNK_SIZE)
        st.session_state["chunks"] = chunks
        st.success(f"Created {len(chunks)} chunks.")

    # Step 3: embed the chunks (placeholder vectors) and index them.
    st.header("Step 3: Create Vector Database")
    if "chunks" in st.session_state and st.button("Create Vector Database"):
        index, _ = create_faiss_index(st.session_state["chunks"], VECTOR_DIM)
        st.session_state["index"] = index
        st.success("FAISS vector database created.")

    # Step 4: send the question straight to the LLM.
    # NOTE(review): the index is only checked for existence — no similarity
    # search is performed and no chunks are passed to the model, so answers
    # are not actually grounded in the document.
    st.header("Step 4: Query the Model")
    question = st.text_input("Ask a question about the document:")
    if question and "index" in st.session_state:
        client = get_groq_client()
        answer = query_model(client, question)
        st.write("Answer:", answer)


if __name__ == "__main__":
    main()