"""Gradio app: upload PDFs, chunk and index them with FAISS + MiniLM embeddings,
then answer questions over the selected PDFs with FLAN-T5."""

import os
import pickle

import faiss
import gradio as gr
import pdfplumber
import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Use FLAN-T5 instead of DeepSeek
model_id = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
model.eval()

# Earlier greedy-decoding version, kept for reference:
# def generate_answer(prompt):
#     inputs = tokenizer(prompt, return_tensors="pt").to(device)
#     with torch.no_grad():
#         outputs = model.generate(**inputs, max_new_tokens=512)
#     return tokenizer.decode(outputs[0], skip_special_tokens=True)


def generate_answer(prompt):
    # Truncate so the prompt stays within the tokenizer's 512-token limit
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=512,
            temperature=0.9,          # Higher = more creative
            repetition_penalty=1.1,   # Penalize repeating the same phrases
            do_sample=True,           # Needed for temperature to take effect
            top_k=50,                 # Sample from the top 50 tokens
            top_p=0.95,               # Nucleus sampling
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


# Sentence embeddings
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

DATA_DIR = "data"
os.makedirs(DATA_DIR, exist_ok=True)


# Save the uploaded PDF, chunk its text, and build a FAISS index for it
def save_pdf(file, title):
    folder = os.path.join(DATA_DIR, title.strip())
    if os.path.exists(folder):
        return f"'{title}' already exists. Use a different title."
    os.makedirs(folder, exist_ok=True)

    # Extract text page by page and split it into 500-character chunks
    chunks = []
    page_numbers = []
    with pdfplumber.open(file.name) as pdf:
        for i, page in enumerate(pdf.pages):
            text = page.extract_text()
            if text:
                for j in range(0, len(text), 500):
                    chunks.append(text[j:j + 500])
                    page_numbers.append(i + 1)

    if not chunks:
        return f"⚠️ No extractable text found in '{title}'."

    embeddings = embedder.encode(chunks)
    if len(embeddings.shape) != 2:
        raise ValueError(f"Expected 2D embeddings, got shape {embeddings.shape}")

    # FAISS expects float32 vectors
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings.astype("float32"))

    index_path = os.path.join(folder, "index.faiss")
    chunks_path = os.path.join(folder, "chunks.pkl")
    faiss.write_index(index, index_path)
    with open(chunks_path, "wb") as f:
        pickle.dump({"chunks": chunks, "page_numbers": page_numbers}, f)

    return f"✅ Saved and indexed '{title}'. You can now ask questions."


def list_titles():
    return [d for d in os.listdir(DATA_DIR) if os.path.isdir(os.path.join(DATA_DIR, d))]


# Answer a question using the indexed PDF context
def ask_question(message, history, selected_titles):
    if not selected_titles:
        return "❗ Please select at least one PDF."

    combined_answer = ""
    for title in selected_titles:
        folder = os.path.join(DATA_DIR, title)
        try:
            index = faiss.read_index(os.path.join(folder, "index.faiss"))
            with open(os.path.join(folder, "chunks.pkl"), "rb") as f:
                data = pickle.load(f)
            chunks = data["chunks"]
            page_numbers = data["page_numbers"]

            # Retrieve the chunks closest to the question
            q_embed = embedder.encode([message]).astype("float32")
            k = min(3, index.ntotal)
            D, I = index.search(q_embed, k)
            context = "\n".join(
                f"(Page {page_numbers[i]}): {chunks[i]}" for i in I[0] if i != -1
            )

            # prompt = f"""Answer the question using only the context below.\n\nContext:\n{context}\n\nQuestion: {message}"""
            prompt = f"""You are a helpful assistant. Provide a thorough and detailed answer to the following question using only the context.
Context: {context}
Question: {message}
Answer in detail: """
            response = generate_answer(prompt)
            combined_answer += f"**{title}**:\n{response.strip()}\n\n"
        except Exception as e:
            combined_answer += f"⚠️ Error with {title}: {str(e)}\n\n"

    return combined_answer.strip()


# Gradio UI
with gr.Blocks() as demo:
    with gr.Tab("📄 Upload PDF"):
        file = gr.File(label="PDF File", file_types=[".pdf"])
        title = gr.Textbox(label="Title for PDF")
        upload_btn = gr.Button("Upload and Index")
        upload_status = gr.Textbox(label="Status")
        upload_btn.click(fn=save_pdf, inputs=[file, title], outputs=upload_status)

    with gr.Tab("💭 Chat with PDFs"):
        pdf_selector = gr.CheckboxGroup(label="Select PDFs", choices=list_titles())
        refresh_btn = gr.Button("🔄 Refresh PDF List")

        def refresh_titles():
            # Returning a bare list would only set the selected value;
            # refresh the available choices instead
            return gr.update(choices=list_titles(), value=[])

        refresh_btn.click(fn=refresh_titles, outputs=pdf_selector)
        chat = gr.ChatInterface(fn=ask_question, additional_inputs=[pdf_selector])

demo.launch()