Spaces:

ritampatra
/

Document_chatbot

Sleeping

App Files Files Community

ritampatra committed on Sep 22, 2024

Commit

e0b9cc5

verified ·

1 Parent(s): d154f38

Update app.py

Browse files

Files changed (1) hide show

app.py +52 -48

app.py CHANGED Viewed

@@ -1,60 +1,64 @@
 import gradio as gr
-from transformers import AutoTokenizer, AutoModel
 import faiss
-import numpy as np
 import torch
 from PyPDF2 import PdfReader
-# Load PDF and extract text from it
-def load_document(file):
-    pdf = PdfReader(file)
-    text = ''
-    for page_num in range(len(pdf.pages)):
-        page = pdf.pages[page_num]
-        text += page.extract_text()
     return text
-# Embed the document using Hugging Face model
-def embed_text(text):
-    # Load tokenizer and model from Hugging Face
-    tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-mpnet-base-v2")
-    model = AutoModel.from_pretrained("sentence-transformers/all-mpnet-base-v2")
-    # Tokenize and embed text
-    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
-    with torch.no_grad():
-        outputs = model(**inputs)
-    embeddings = outputs.last_hidden_state.mean(dim=1)  # Mean pooling to get the embedding
-    return embeddings.squeeze().numpy()
-# Initialize FAISS index
-def initialize_faiss(embedding_size):
-    index = faiss.IndexFlatL2(embedding_size)
-    return index
-# Add document embeddings to FAISS index
-def add_to_index(index, embeddings):
-    index.add(embeddings)
-# Search the FAISS index for the best matching text
-def search_index(index, query_embedding, texts, top_k=3):
-    distances, indices = index.search(np.array([query_embedding]), top_k)
-    return [texts[i] for i in indices[0]]
-# Process the document and build the FAISS index
-def process_document(file):
-    text = load_document(file)
-    chunks = [text[i:i + 512] for i in range(0, len(text), 512)]  # Split text into chunks
-    embeddings = np.vstack([embed_text(chunk) for chunk in chunks])  # Create embeddings for each chunk
-    faiss_index = initialize_faiss(embeddings.shape[1])  # Initialize FAISS index
-    add_to_index(faiss_index, embeddings)  # Add embeddings to FAISS index
-    return faiss_index, chunks
-# Answer query by searching FAISS index
-def query_document(query, faiss_index, document_chunks):
-    query_embedding = embed_text(query)  # Embed query
-    results = search_index(faiss_index, query_embedding, document_chunks)  # Search for the best matching chunks
-    return "\n\n".join(results)  # Return the matching document parts
 # Gradio interface
 def chatbot_interface():
@@ -64,7 +68,7 @@ def chatbot_interface():
     # Function to handle document upload
     def upload_file(file):
         nonlocal faiss_index, document_chunks
-        faiss_index, document_chunks = process_document(file)
         return "Document uploaded and indexed. You can now ask questions."
     # Function to handle user queries
@@ -76,7 +80,7 @@ def chatbot_interface():
     # Gradio UI
     upload = gr.File(label="Upload a PDF document")
     question = gr.Textbox(label="Ask a question about the document")
-    answer = gr.Textbox(label="Answer", readonly=True)
     # Gradio app layout
     with gr.Blocks() as demo:

 import gradio as gr
+import os
+from transformers import pipeline
 import faiss
 import torch
 from PyPDF2 import PdfReader
+# Function to extract text from a PDF file
+def extract_text_from_pdf(pdf_file):
+    pdf_reader = PdfReader(pdf_file)
+    text = ""
+    for page_num in range(len(pdf_reader.pages)):
+        text += pdf_reader.pages[page_num].extract_text()
     return text
+# Function to split text into chunks
+def split_text_into_chunks(text, chunk_size=500):
+    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
+# Function to embed text chunks using a pre-trained model
+def embed_text_chunks(text_chunks, model_name="sentence-transformers/all-MiniLM-L6-v2"):
+    embedder = pipeline("feature-extraction", model=model_name)
+    embeddings = [embedder(chunk)[0][0] for chunk in text_chunks]
+    return torch.tensor(embeddings)
+# Function to build FAISS index for document chunks
+def build_faiss_index(embeddings):
+    d = embeddings.shape[1]  # Dimension of embeddings
+    index = faiss.IndexFlatL2(d)
+    index.add(embeddings.numpy())
+    return index
+# Function to process uploaded document
+def process_document(pdf_file):
+    # Extract text from the PDF
+    text = extract_text_from_pdf(pdf_file)
+    # Split text into chunks
+    document_chunks = split_text_into_chunks(text)
+    # Embed document chunks
+    embeddings = embed_text_chunks(document_chunks)
+    # Build FAISS index
+    faiss_index = build_faiss_index(embeddings)
+    return faiss_index, document_chunks
+# Function to query the FAISS index for a question
+def query_document(query, faiss_index, document_chunks, model_name="sentence-transformers/all-MiniLM-L6-v2"):
+    embedder = pipeline("feature-extraction", model=model_name)
+    # Embed the query
+    query_embedding = embedder(query)[0][0]
+    query_embedding = torch.tensor(query_embedding).unsqueeze(0).numpy()
+    # Search the FAISS index
+    _, I = faiss_index.search(query_embedding, k=1)
+    # Get the most relevant chunk
+    return document_chunks[I[0][0]]
 # Gradio interface
 def chatbot_interface():
     # Function to handle document upload
     def upload_file(file):
         nonlocal faiss_index, document_chunks
+        faiss_index, document_chunks = process_document(file.name)
         return "Document uploaded and indexed. You can now ask questions."
     # Function to handle user queries
     # Gradio UI
     upload = gr.File(label="Upload a PDF document")
     question = gr.Textbox(label="Ask a question about the document")
+    answer = gr.Textbox(label="Answer", interactive=False)  # Updated to interactive=False
     # Gradio app layout
     with gr.Blocks() as demo: