import os
import asyncio
import streamlit as st
from crawl4ai import AsyncWebCrawler
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
# Load API keys from Space Secrets
os.environ["HUGGINGFACEHUB_API_KEY"] = st.secrets["hf"]
os.environ["HF_TOKEN"] = st.secrets["hf"]
async def run_pipeline(url: str, query: str):
    # 1. Crawl the page with crawl4ai
    browser_config = BrowserConfig()
    run_config = CrawlerRunConfig()
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(url=url, config=run_config)

    # 2. Wrap the crawled markdown in a LangChain Document and split it into chunks
    doc = Document(page_content=result.markdown.raw_markdown)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunks = text_splitter.split_documents([doc])

    # 3. Embed the chunks and index them in an in-memory Chroma store
    emb = HuggingFaceEmbeddings(model="avsolatorio/GIST-small-Embedding-v0")
    cb = Chroma(embedding_function=emb)
    cb.add_documents(chunks)

    # 4. Retrieve the chunks most similar to the question
    docs = cb.similarity_search(query, k=3)

    # 5. Answer with Llama 3.1, served through the Nebius inference provider
    llama_model = HuggingFaceEndpoint(
        repo_id="meta-llama/Llama-3.1-8B-Instruct",
        provider="nebius",
        temperature=0.7,
        max_new_tokens=300,
        task="conversational",
    )
    # ChatHuggingFace only needs the configured endpoint; the generation
    # settings above are carried by the HuggingFaceEndpoint itself.
    llama = ChatHuggingFace(llm=llama_model)
    response = llama.invoke(
        f"Context: {docs[0].page_content}\n\nQuestion: {query}"
    )
    return response.content
# Streamlit UI
st.title("ππ Ask Any Website with Llama3")
st.write("Enter a URL and your question β this app crawls the site and answers using Llama3!")
url = st.text_input("π Website URL", placeholder="https://www.example.com")
query = st.text_input("π¬ Your Question", placeholder="What is this website about?")
if st.button("Crawl & Answer"):
    if not url.strip() or not query.strip():
        st.warning("Please enter both a URL and a question.")
    else:
        with st.spinner("Crawling website and generating answer..."):
            result = asyncio.run(run_pipeline(url, query))
            st.success(f"**Answer:** {result}")
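
# ---------------------------------------------------------------------------
# Assumed environment for this Space (a sketch inferred from the imports
# above, not taken from the repo's actual requirements.txt; exact pins and
# the "hf" secret name must match your own Space settings):
#
#   streamlit
#   crawl4ai
#   langchain-core
#   langchain-text-splitters
#   langchain-huggingface
#   langchain-community
#   chromadb
#   sentence-transformers
#
# crawl4ai drives a headless browser, so it typically needs a one-time
# `crawl4ai-setup` (or `playwright install`) after installation.
# Run the app locally with: streamlit run app.py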