import os
import asyncio

import streamlit as st
from crawl4ai import AsyncWebCrawler
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

# Load API keys from Space Secrets
# (langchain_huggingface reads HUGGINGFACEHUB_API_TOKEN; huggingface_hub reads HF_TOKEN)
os.environ["HUGGINGFACEHUB_API_TOKEN"] = st.secrets["hf"]
os.environ["HF_TOKEN"] = st.secrets["hf"]


async def run_pipeline(url: str, query: str):
    # 1️⃣ Crawler setup: default browser and run configs
    browser_config = BrowserConfig()
    run_config = CrawlerRunConfig()
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(url=url, config=run_config)

    # 2️⃣ Wrap the crawled markdown in a LangChain Document and split it into chunks
    doc = Document(page_content=result.markdown.raw_markdown)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunks = text_splitter.split_documents([doc])

    # 3️⃣ Embed the chunks and index them in an in-memory Chroma store
    emb = HuggingFaceEmbeddings(model_name="avsolatorio/GIST-small-Embedding-v0")
    db = Chroma(embedding_function=emb)
    db.add_documents(chunks)

    # 4️⃣ Retrieve the 3 chunks most similar to the question and join them as context
    docs = db.similarity_search(query, k=3)
    context = "\n\n".join(d.page_content for d in docs)

    # 5️⃣ Llama 3.1 via the Nebius inference provider; ChatHuggingFace only needs
    # the wrapped endpoint — the generation settings live on HuggingFaceEndpoint
    llama_model = HuggingFaceEndpoint(
        repo_id="meta-llama/Llama-3.1-8B-Instruct",
        provider="nebius",
        temperature=0.7,
        max_new_tokens=300,
        task="conversational",
    )
    llama = ChatHuggingFace(llm=llama_model)

    response = llama.invoke(
        f"Context: {context}\n\nQuestion: {query}"
    )
    return response.content


# Streamlit UI
st.title("🌐🔍 Ask Any Website with Llama3")
st.write("Enter a URL and your question — this app crawls the site and answers using Llama3!")

url = st.text_input("📌 Website URL", placeholder="https://www.example.com")
query = st.text_input("💬 Your Question", placeholder="What is this website about?")

if st.button("🔎 Crawl & Answer"):
    if not url.strip() or not query.strip():
        st.warning("❗ Please enter both a URL and a question.")
    else:
        with st.spinner("🕸️ Crawling website and generating answer..."):
            result = asyncio.run(run_pipeline(url, query))
        st.success(f"✅ **Answer:** {result}")