# app.py
import os
import logging
import re

import requests
import numpy as np
import faiss
import gradio as gr
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
from langchain.llms import Together
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.summarize import load_summarize_chain
from langchain.chains.question_answering import load_qa_chain
from langchain.docstore.document import Document

# Load the Together API key from the environment (recommended on HF Spaces)
TOGETHER_API_KEY = os.environ.get("TOGETHER_API_KEY")

# Logging setup
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load models
logger.info("🔍 Loading sentence transformer and LLM...")
embed_model = SentenceTransformer("all-MiniLM-L6-v2")
llm = Together(
    model="togethercomputer/llama-3-70b-chat",
    temperature=0.7,
    max_tokens=512,
    together_api_key=TOGETHER_API_KEY,
)

# Global cache, populated by process_webpage_and_load()
vector_index = None
doc_chunks = []
doc_texts = []
doc_embeddings = []


# Helper functions
def fetch_webpage_text(url):
    """Fetch a page and return its main text (Wikipedia-style pages preferred)."""
    try:
        response = requests.get(url, timeout=15)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        # Wikipedia keeps article text in #mw-content-text; fall back to <body>.
        content = soup.find("div", {"id": "mw-content-text"}) or soup.body
        return content.get_text(separator="\n", strip=True)
    except Exception as e:
        logger.error(f"❌ Error fetching content: {e}")
        return ""


def clean_text(text):
    """Strip citation markers like [12] or [note], then normalize whitespace."""
    text = re.sub(r'\[\s*\d+\s*\]', '', text)
    text = re.sub(r'\[\s*[a-zA-Z]+\s*\]', '', text)
    text = re.sub(r'\n{2,}', '\n', text)
    text = re.sub(r'[ \t]+', ' ', text)
    return text.strip()


def chunk_text(text, chunk_size=500, overlap=50):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=overlap,
    )
    return splitter.split_text(text)


def create_vectorstore(chunks):
    """Embed every chunk in one batch and build a FAISS L2 index over the vectors."""
    texts = list(chunks)
    embeddings = embed_model.encode(texts)  # (n_chunks, 384) for MiniLM
    dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(np.asarray(embeddings, dtype=np.float32))
    return index, texts, embeddings


def get_summary(chunks):
    full_doc = Document(page_content="\n\n".join(chunks))
    summarize_chain = load_summarize_chain(llm, chain_type="map_reduce")
    return summarize_chain.run([full_doc])


def chat_with_bot(question):
    if not doc_chunks or vector_index is None:
        return "⚠️ Please load a webpage first."
    # Reuse the cached FAISS index instead of rebuilding it on every question.
    query_vector = np.asarray([embed_model.encode(question)], dtype=np.float32)
    D, I = vector_index.search(query_vector, 5)
    top_chunks = [doc_texts[i] for i in I[0]]
    # Stuff the retrieved chunks directly into the prompt; RetrievalQA is not
    # needed here since retrieval is already done against the FAISS index.
    qa_chain = load_qa_chain(llm, chain_type="stuff")
    return qa_chain.run(
        input_documents=[Document(page_content="\n\n".join(top_chunks))],
        question=question,
    )


def summarize_content():
    if not doc_chunks:
        return "⚠️ No content loaded yet. Please load a valid webpage."
    return get_summary(doc_chunks)


def process_webpage_and_load(url):
    global doc_chunks, vector_index, doc_texts, doc_embeddings
    logger.info(f"🌐 Loading URL: {url}")
    text = fetch_webpage_text(url)
    if not text:
        return "❌ Failed to load or parse webpage."
    cleaned = clean_text(text)
    doc_chunks = chunk_text(cleaned)
    vector_index, doc_texts, doc_embeddings = create_vectorstore(doc_chunks)
    return "✅ Webpage content processed and ready!"
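
# A minimal smoke test of the ingest-and-retrieve path (a sketch: the
# SMOKE_TEST variable and the example URL are assumptions, not part of the
# app's normal flow; the second call hits the Together API).
if os.environ.get("SMOKE_TEST"):
    print(process_webpage_and_load("https://en.wikipedia.org/wiki/LLaMA"))
    print(chat_with_bot("Who developed LLaMA?"))
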
# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("## 🤖 Chat with LLaMA Webpage Content")

    # Load a webpage into the global cache before chatting or summarizing.
    with gr.Row():
        url_input = gr.Textbox(
            label="Webpage URL",
            placeholder="e.g., https://en.wikipedia.org/wiki/LLaMA",
        )
        load_btn = gr.Button("Load Webpage")
    load_status = gr.Markdown()

    with gr.Row():
        chatbot = gr.Chatbot(label="Chat History")
    with gr.Row():
        question = gr.Textbox(
            label="Ask your question about LLaMA",
            placeholder="e.g., Who developed LLaMA?",
        )
        ask_btn = gr.Button("Submit")
        clear_btn = gr.Button("Clear Chat")

    summary_output = gr.Textbox(label="📋 Summary of the Webpage", lines=8)
    summarize_btn = gr.Button("Summarize Content")

    # Button logic
    def user_chat_handler(q, history):
        response = chat_with_bot(q)
        history.append((q, response))
        return history, ""

    load_btn.click(fn=process_webpage_and_load, inputs=url_input, outputs=load_status)
    ask_btn.click(fn=user_chat_handler, inputs=[question, chatbot], outputs=[chatbot, question])
    clear_btn.click(lambda: [], None, chatbot)
    summarize_btn.click(fn=summarize_content, inputs=[], outputs=summary_output)

demo.launch()
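
# Local run sketch (assumptions: on HF Spaces the runtime launches app.py
# itself, so these steps only matter when running off-Spaces):
#   export TOGETHER_API_KEY="..."   # the key read at the top of this file
#   python app.py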