In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import time
import os

session = requests.Session()
session.headers.update({"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"})

data_path = "../data"

In [2]:
def faq_crawl(url):
 data_path_faq = data_path + '/faqs'
 os.makedirs(data_path_faq, exist_ok=True)

 print(f"Crawling: {url}")

 time.sleep(10)
 response = session.get(url)
 if response.status_code != 200:
 print(f"Failed to fetch the website, {url}. Status Code: {response.status_code}")
 return
 
 filename = urlparse(url).path.strip("/").replace("/", "_") or "home"
 filepath = f"{data_path_faq}/{filename}.txt"


 soup = BeautifulSoup(response.content, "html.parser")
 
 general = soup.find_all(['h1'])
 faq_question = soup.find_all('span', class_=['uagb-question'])
 faq_content = soup.find_all('div', class_=['uagb-faq-content'])

 with open(filepath, "w", encoding="utf-8") as f:
 f.write(f"Source URL: {url}\n\n")
 for gen in general:
 text = gen.get_text(strip=True)
 f.write(f'{text}\n')
 
 for question,answer in zip(faq_content,faq_question):
 q_text = question.get_text(strip=True)
 a_text = answer.get_text(strip=True)
 if q_text and a_text:
 f.write(f"Q: {q_text}\n")
 f.write(f"A: {a_text}\n")
 f.write("---\n")

In [3]:
faq_crawl("https://aurowellness.com/faqs/")

Crawling: https://aurowellness.com/faqs/


In [4]:
def get_blog_urls_from_sitemap(sitemap_url):
 response = session.get(sitemap_url)
 soup = BeautifulSoup(response.content, features='lxml')
 
 urls = []
 for url_tag in soup.find_all('url'):
 loc_tag = url_tag.find('loc')
 if loc_tag and loc_tag.text:
 urls.append(loc_tag.text.strip())
 return urls

In [5]:
blog_urls = get_blog_urls_from_sitemap('https://aurowellness.com/post-sitemap.xml')


Assuming this really is an XML document, what you're doing might work, but you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the Python package 'lxml' installed, and pass the keyword argument `features="xml"` into the BeautifulSoup constructor.




 soup = BeautifulSoup(response.content, features='lxml')


In [6]:
def crawl_blogs(url):
 data_path_blogs = data_path + '/blogs'
 os.makedirs(data_path_blogs, exist_ok=True)

 print(f"Crawling: {url}")

 response = session.get(url)
 if response.status_code != 200:
 print(f"Failed to fetch the website, {url}. Status Code: {response.status_code}")
 return
 
 filename = urlparse(url).path.strip("/").replace("/", "_") or "home"
 filepath = f"{data_path_blogs}/{filename}.txt"

 soup = BeautifulSoup(response.content, 'html.parser')

 blog_content = soup.find('div', class_='entry-content')
 if not blog_content:
 print(f"No entry-content found in {url}")
 return

 tags = blog_content.find_all(['h1','h2', 'p'])
 with open(filepath, "w", encoding="utf-8") as f:
 f.write(f"Source URL: {url}\n\n")
 for tag in tags:
 text = tag.get_text(strip=False)
 if text:
 f.write(f'{text}\n')

In [7]:
for blog_url in blog_urls:
 crawl_blogs(blog_url)

Crawling: https://aurowellness.com/blog/
No entry-content found in https://aurowellness.com/blog/
Crawling: https://aurowellness.com/blog/tocotrienols/
Crawling: https://aurowellness.com/blog/prevent-forehead-wrinkles/
Crawling: https://aurowellness.com/blog/glutathione-lotion/
Crawling: https://aurowellness.com/blog/glutathione-supplements/
Crawling: https://aurowellness.com/blog/glutathione-for-skin-2/
Crawling: https://aurowellness.com/blog/glutathione-vitamin-c/
Crawling: https://aurowellness.com/blog/glutathione-shots/
Crawling: https://aurowellness.com/blog/oxidized-glutathione/
Crawling: https://aurowellness.com/blog/best-clean-skincare-for-rosacea/
Crawling: https://aurowellness.com/blog/nano-glutathione/
Crawling: https://aurowellness.com/blog/glutathione-for-skin/
Crawling: https://aurowellness.com/blog/glutathione-cosmetics/
Crawling: https://aurowellness.com/blog/glutathione-cream/
Crawling: https://aurowellness.com/blog/glutathione-and-cancer/
Crawling: https://aurowellnes

In [8]:
def technology_crawl(url):
 data_path_tech = data_path + '/technology'
 os.makedirs(data_path_tech, exist_ok=True)

 print(f"Crawling: {url}")
 
 response = session.get(url)
 if response.status_code != 200:
 print(f"Failed to fetch the website, {url}. Status Code: {response.status_code}")
 return
 
 filename = urlparse(url).path.strip("/").replace("/", "_") or "home"
 filepath = f"{data_path_tech}/{filename}.txt"

 soup = BeautifulSoup(response.content, 'html.parser')

 technology_content = soup.find('div', class_='entry-content')
 if not technology_content:
 print(f"No entry-content found in {url}")
 return
 
 tags = technology_content.find_all(['h1', 'h2', 'h3', 'p'])
 with open(filepath, "w", encoding="utf-8") as f:
 f.write(f"Source URL: {url}\n\n")
 for tag in tags:
 text = tag.get_text(strip=False)
 if text:
 f.write(f'{text}\n')

In [9]:
technology_crawl('https://aurowellness.com/technology/')

Crawling: https://aurowellness.com/technology/


In [10]:
def revolution_crawl(url):
 data_path_rev = data_path + '/revolution'
 os.makedirs(data_path_rev, exist_ok=True)

 print(f"Crawling: {url}")
 
 response = session.get(url)
 if response.status_code != 200:
 print(f"Failed to fetch the website, {url}. Status Code: {response.status_code}")
 return
 
 filename = urlparse(url).path.strip("/").replace("/", "_") or "home"
 filepath = f"{data_path_rev}/{filename}.txt"

 soup = BeautifulSoup(response.content, 'html.parser')
 rev_content = soup.find('div', class_='entry-content')

 excluded_classes = [
 "custom-related-products-title",
 "custom-related-products",
 "has-white-color"
 ]

 content_divs = [
 child for child in rev_content.find_all('div', recursive=False)
 if not any(cls in child.get('class', []) for cls in excluded_classes)
 ]


 if not rev_content:
 print(f"No entry-content found in {url}")
 return
 
 with open(filepath, "w", encoding="utf-8") as f:
 f.write(f"Source URL: {url}\n\n")
 for div in content_divs:
 tags = div.find_all(lambda tag: (
 tag.name in ['h1', 'h2', 'p'] or
 (tag.name == 'div' and 'uagb-tm__desc' in tag.get('class', []))
 ))
 for tag in tags:
 text = tag.get_text(strip=False)
 if text:
 f.write(f'{text}\n')

In [11]:
revolution_crawl('https://aurowellness.com/glutathione-revolution/')

Crawling: https://aurowellness.com/glutathione-revolution/
