Spaces:
Running
Running
Upload app.py
Browse files
app.py
ADDED
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
from scrape import scrape_website, extract_body_content, clean_body_content, split_dom_content
|
3 |
+
from parse import parse_with_groq
|
4 |
+
|
5 |
+
# Streamlit UI for a two-step workflow:
#   1. scrape a URL and keep the cleaned page text in session state;
#   2. ask the selected model to extract what the user describes from it.
#
# NOTE(review): the emoji inside the UI strings below appear mis-encoded in
# this copy of the file. They are reproduced verbatim here -- confirm against
# the original source before changing them.
st.set_page_config(page_title="Web Scraping App π§ ", page_icon="π")

# Model identifiers offered in the sidebar; the chosen one is forwarded to
# parse_with_groq() as its `model` argument.
# NOTE(review): some entries (e.g. the "whisper" and "llava" ones) look like
# audio/vision models by name -- confirm they are usable for text parsing.
MODEL_CHOICES = [
    "llama3-8b-8192",
    "distil-whisper-large-v3-en",
    "llama3-groq-70b-8192-tool-use-preview",
    "llama-3.1-8b-instant",
    "llava-v1.5-7b-4096-preview",
    "mixtral-8x7b-32768",
]

# Raw HTML/CSS for a footer pinned to the bottom of the viewport
# (position: fixed; bottom: 0). Content is kept flush-left and unchanged.
FOOTER_HTML = """
<style>
.footer {
position: fixed;
bottom: 0;
left: 0;
width: 100%;
background-color: #272432; /* Dark background for visibility */
color: white;
text-align: center;
padding: 10px;
font-size: 14px;
}
.sidebar .footer {
position: fixed;
bottom: 0;
}
</style>

<div class="footer">
Made with β€οΈ by Usman Yousaf π<br>
Feel free to improve and expand this app for more powerful insights! π₯
</div>
"""

st.sidebar.title("π Model Selection")
selected_model = st.sidebar.selectbox(
    "Choose a Model for Parsing:",
    MODEL_CHOICES,
)

# Application title
st.title("AI Web Scraper App π")
st.write("Easily scrape and analyze web content using advanced AI models. π")

# Input for website URL
url = st.text_input("Enter Website URL π")

# Step 1: Scrape the Website
if st.button("Scrape Website"):
    # Ignore surrounding whitespace so a blank-looking input is treated as
    # empty and a padded URL is not handed to the scraper as-is.
    target_url = url.strip()
    if target_url:
        st.write("π΅οΈββοΈ Scraping the website...")

        # Fetch the page, isolate its body and clean it up.
        dom_content = scrape_website(target_url)
        body_content = extract_body_content(dom_content)
        cleaned_content = clean_body_content(body_content)

        # Persist the cleaned content across Streamlit reruns so the parse
        # step below stays available after this button press.
        st.session_state.dom_content = cleaned_content

        # Display the cleaned content in an expandable text box
        with st.expander("View DOM Content"):
            st.text_area("DOM Content", cleaned_content, height=300)
    else:
        # Previously an empty URL made the button a silent no-op.
        st.warning("Please enter a website URL before scraping.")

# Step 2: Parse the Content (only offered once something has been scraped)
if "dom_content" in st.session_state:
    parse_description = st.text_area("Describe what you want to parse π")

    if st.button("Parse Content"):
        if parse_description:
            st.write(f"π€ Parsing the content with {selected_model}...")

            # Split the stored content into chunks and parse them with Groq
            dom_chunks = split_dom_content(st.session_state.dom_content)
            parsed_result = parse_with_groq(
                dom_chunks, parse_description, model=selected_model
            )
            st.write(parsed_result)
        else:
            # Previously an empty description made the button a silent no-op.
            st.warning("Please describe what you want to parse.")

# Footer (raw HTML, hence unsafe_allow_html)
st.markdown(FOOTER_HTML, unsafe_allow_html=True)
|